In [90]:
### setup the working env using Merck proxy
import os
os.environ['http_proxy'] = "http://webproxy.merck.com:8080"     
os.environ['https_proxy'] = "http://webproxy.merck.com:8080"

import pandas as pd
import numpy as np
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import psycopg2

### For blank plot issue
import plotly.offline as pyo
import plotly.graph_objs as go
# Set notebook mode to work in offline
pyo.init_notebook_mode()

from pycaret.utils import enable_colab
enable_colab()

## To display the whole dataframe
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
Colab mode enabled.
In [530]:
df1= pd.read_csv('DIRECT_INPUT_BOT_EMAIL.csv')


print(df1.nunique())
print(df1.columns)
print(df1.info())
print(df1.shape)
df1.isnull().sum()
EMAIL_TYPE           1
ClickId_AE           0
ClickId_ME      176876
SentId          100104
BOT_ACTIVITY         5
EIS_ID          176876
NON_BOT              2
dtype: int64
Index(['EMAIL_TYPE', 'ClickId_AE', 'ClickId_ME', 'SentId', 'BOT_ACTIVITY',
       'EIS_ID', 'NON_BOT'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176876 entries, 0 to 176875
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   EMAIL_TYPE    176876 non-null  object 
 1   ClickId_AE    0 non-null       float64
 2   ClickId_ME    176876 non-null  int64  
 3   SentId        176876 non-null  object 
 4   BOT_ACTIVITY  172468 non-null  object 
 5   EIS_ID        176876 non-null  int64  
 6   NON_BOT       176876 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 9.4+ MB
None
(176876, 7)
Out[530]:
EMAIL_TYPE           0
ClickId_AE      176876
ClickId_ME           0
SentId               0
BOT_ACTIVITY      4408
EIS_ID               0
NON_BOT              0
dtype: int64
In [ ]:
#df1.describe()

Check for duplicates and bounced

In [543]:
#del df1['BOT_ACTIVITY']
duplicate = df1[df1.duplicated()]
print(duplicate)
Empty DataFrame
Columns: [EMAIL_TYPE, ClickId_AE, ClickId_ME, SentId, BOT_ACTIVITY, EIS_ID, NON_BOT]
Index: []

What does the Target var look like?

In [542]:
# Non_Promotional_click

df1['BOT_ACTIVITY'].value_counts()
Out[542]:
false                        164934
Bot_click_model                4690
Bot_open_model                 1925
Bot_click_Honeypot              470
Bot_click_Honeypot_around       449
Name: BOT_ACTIVITY, dtype: int64
In [544]:
# NA label
df1['BOT_ACTIVITY'].isnull().sum()
Out[544]:
4408

Project initial questions:

  • "false" includes "human" and raw data is 176k from 03/2020 - 11/2021. After cleansing is 85k. How many/which events have honeypot links? When was is embeded first?
  • Table containing all Marketing E-mail action fact data(its SFMC not Approved...)? ref: dict
  • If a bot clicked a link, it should had opend it as well?
  • So "Bot_open_model" should be an "open event" without click and "Bot_click_model" is open and click?
  • Can we add them together then? - German rules were applied on "all activities" and "clicks only" -
  • How they implemented German rules - open/clicks_model (domain,usless links..)?
  • Honeypot labels show "Pure" bot activity? No they show bot activity. No human? maybe. Can we detect their URL?
  • "Bot_click_Honeypot" and "Bot_click_Honeypot_around" are same bot same time same link, so are they duplicates in the data? 157 duplicates
  • One class SVM anomaly/outlier detection vs Over/under sampling? 2k data?
  • Non_Promotional_click is removed by Netherland
  • Indication if the click was unique for the user, for the used newsletter.
  • Indication if the click was unique for the user, for clicked URL
  • How can "sent event" have a human/false label? Do we need to remove "sent" and keep open and clicks? After removing NAs it goes away and we are left with seperate Cliks and Open.
  • Join_1 vs 2
In [549]:
# German all bot activities (Netherland data) is 3.7% or ~96% human vs 40% Thrung's 
100*(4690+ 1925)/176000
Out[549]:
3.758522727272727
In [550]:
# Honeypot-based bot activity is 0.5% vs 3.7% (Netherland/German)
100*(449+ 470)/176000
Out[550]:
0.522159090909091
In [551]:
df2= pd.read_csv('DIRECT_INPUT_SFMC_F_ME_ACTION.csv')

print(df2.nunique())
print(df2.columns)
print(df2.info())
print(df2.shape)
df2.isnull().sum()
%Country Mailing Key        238
%Email Activity Key      100104
%User Activity Key        98451
%Country Key                  1
%Mailing Key                238
%MDM Key                  12700
%Calendar Key               548
%Campaign Key                18
%Journey Activity Key       119
ME Event Time             57950
Subscriber_List ID        18152
ME Action Email           13146
ME Event Date             58007
ME Event Type                 5
ME Send URL                1378
ME URL ID                   748
ME Click URL               5652
ME Alias                   2848
Triggered Email Key         130
ME_Is_Unique                  2
ME_Is_Unique_for_URL          2
ME Browser                   13
ME Email Client              15
ME OperatingSystem           15
ME Device                     5
ME Bounce Category            5
ME Bounce Reason            244
ME Unsubscribe Reason         2
EIS_ID                   176876
dtype: int64
Index(['%Country Mailing Key', '%Email Activity Key', '%User Activity Key',
       '%Country Key', '%Mailing Key', '%MDM Key', '%Calendar Key',
       '%Campaign Key', '%Journey Activity Key', 'ME Event Time',
       'Subscriber_List ID', 'ME Action Email', 'ME Event Date',
       'ME Event Type', 'ME Send URL', 'ME URL ID', 'ME Click URL', 'ME Alias',
       'Triggered Email Key', 'ME_Is_Unique', 'ME_Is_Unique_for_URL',
       'ME Browser', 'ME Email Client', 'ME OperatingSystem', 'ME Device',
       'ME Bounce Category', 'ME Bounce Reason', 'ME Unsubscribe Reason',
       'EIS_ID'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176876 entries, 0 to 176875
Data columns (total 29 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   %Country Mailing Key   176876 non-null  object 
 1   %Email Activity Key    176876 non-null  object 
 2   %User Activity Key     176876 non-null  object 
 3   %Country Key           176876 non-null  object 
 4   %Mailing Key           176876 non-null  int64  
 5   %MDM Key               176837 non-null  float64
 6   %Calendar Key          176876 non-null  object 
 7   %Campaign Key          45047 non-null   float64
 8   %Journey Activity Key  176876 non-null  object 
 9   ME Event Time          176767 non-null  object 
 10  Subscriber_List ID     176876 non-null  object 
 11  ME Action Email        176876 non-null  object 
 12  ME Event Date          176876 non-null  object 
 13  ME Event Type          176876 non-null  object 
 14  ME Send URL            16900 non-null   float64
 15  ME URL ID              16900 non-null   float64
 16  ME Click URL           16900 non-null   object 
 17  ME Alias               16900 non-null   object 
 18  Triggered Email Key    52784 non-null   float64
 19  ME_Is_Unique           85115 non-null   object 
 20  ME_Is_Unique_for_URL   16900 non-null   object 
 21  ME Browser             85115 non-null   object 
 22  ME Email Client        85115 non-null   object 
 23  ME OperatingSystem     85115 non-null   object 
 24  ME Device              85115 non-null   object 
 25  ME Bounce Category     4363 non-null    object 
 26  ME Bounce Reason       4363 non-null    object 
 27  ME Unsubscribe Reason  41 non-null      object 
 28  EIS_ID                 176876 non-null  int64  
dtypes: float64(5), int64(2), object(22)
memory usage: 39.1+ MB
None
(176876, 29)
Out[551]:
%Country Mailing Key          0
%Email Activity Key           0
%User Activity Key            0
%Country Key                  0
%Mailing Key                  0
%MDM Key                     39
%Calendar Key                 0
%Campaign Key            131829
%Journey Activity Key         0
ME Event Time               109
Subscriber_List ID            0
ME Action Email               0
ME Event Date                 0
ME Event Type                 0
ME Send URL              159976
ME URL ID                159976
ME Click URL             159976
ME Alias                 159976
Triggered Email Key      124092
ME_Is_Unique              91761
ME_Is_Unique_for_URL     159976
ME Browser                91761
ME Email Client           91761
ME OperatingSystem        91761
ME Device                 91761
ME Bounce Category       172513
ME Bounce Reason         172513
ME Unsubscribe Reason    176835
EIS_ID                        0
dtype: int64
In [ ]:
# import os

# cwd = os.getcwd()  # Get the current working directory (cwd)
# files = os.listdir(cwd)  # Get all the files in that directory
# print("Files in %r: %s" % (cwd, files))
In [ ]:
df1.head(3)
In [ ]:
df2.head(3)

Merging

In [ ]:
# ClickId_ME: Click ID from ME, can be linked to EIS_ID in DIRECT_INPUT_SFMC_F_ME_ACTION (df2)
# SentId: Sent Id of ME, can be linked to ‘%User Activity Key’ in DIRECT_INPUT_SFMC_F_ME_ACTION
In [ ]:
#print(df1.columns)
#print(df2.columns)
print(list(df1))
print(list(df2))
In [ ]:
# remove NAs
In [546]:
### First Join

joined_table1 = pd.merge(df1, df2, how = 'inner', left_on= 'ClickId_ME', right_on= 'EIS_ID')
joined_table1.shape
Out[546]:
(176876, 36)
In [547]:
joined_table1['BOT_ACTIVITY'].value_counts()
Out[547]:
false                        164934
Bot_click_model                4690
Bot_open_model                 1925
Bot_click_Honeypot              470
Bot_click_Honeypot_around       449
Name: BOT_ACTIVITY, dtype: int64
In [548]:
print(joined_table1.nunique())
print(joined_table1.columns)
print(joined_table1.info())
print(joined_table1.shape)
joined_table1.isnull().sum()
EMAIL_TYPE                    1
ClickId_AE                    0
ClickId_ME               176876
SentId                   100104
BOT_ACTIVITY                  5
EIS_ID_x                 176876
NON_BOT                       2
%Country Mailing Key        238
%Email Activity Key      100104
%User Activity Key        98451
%Country Key                  1
%Mailing Key                238
%MDM Key                  12700
%Calendar Key               548
%Campaign Key                18
%Journey Activity Key       119
ME Event Time             57950
Subscriber_List ID        18152
ME Action Email           13146
ME Event Date             58007
ME Event Type                 5
ME Send URL                1378
ME URL ID                   748
ME Click URL               5652
ME Alias                   2848
Triggered Email Key         130
ME_Is_Unique                  2
ME_Is_Unique_for_URL          2
ME Browser                   13
ME Email Client              15
ME OperatingSystem           15
ME Device                     5
ME Bounce Category            5
ME Bounce Reason            244
ME Unsubscribe Reason         2
EIS_ID_y                 176876
dtype: int64
Index(['EMAIL_TYPE', 'ClickId_AE', 'ClickId_ME', 'SentId', 'BOT_ACTIVITY',
       'EIS_ID_x', 'NON_BOT', '%Country Mailing Key', '%Email Activity Key',
       '%User Activity Key', '%Country Key', '%Mailing Key', '%MDM Key',
       '%Calendar Key', '%Campaign Key', '%Journey Activity Key',
       'ME Event Time', 'Subscriber_List ID', 'ME Action Email',
       'ME Event Date', 'ME Event Type', 'ME Send URL', 'ME URL ID',
       'ME Click URL', 'ME Alias', 'Triggered Email Key', 'ME_Is_Unique',
       'ME_Is_Unique_for_URL', 'ME Browser', 'ME Email Client',
       'ME OperatingSystem', 'ME Device', 'ME Bounce Category',
       'ME Bounce Reason', 'ME Unsubscribe Reason', 'EIS_ID_y'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Int64Index: 176876 entries, 0 to 176875
Data columns (total 36 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   EMAIL_TYPE             176876 non-null  object 
 1   ClickId_AE             0 non-null       float64
 2   ClickId_ME             176876 non-null  int64  
 3   SentId                 176876 non-null  object 
 4   BOT_ACTIVITY           172468 non-null  object 
 5   EIS_ID_x               176876 non-null  int64  
 6   NON_BOT                176876 non-null  int64  
 7   %Country Mailing Key   176876 non-null  object 
 8   %Email Activity Key    176876 non-null  object 
 9   %User Activity Key     176876 non-null  object 
 10  %Country Key           176876 non-null  object 
 11  %Mailing Key           176876 non-null  int64  
 12  %MDM Key               176837 non-null  float64
 13  %Calendar Key          176876 non-null  object 
 14  %Campaign Key          45047 non-null   float64
 15  %Journey Activity Key  176876 non-null  object 
 16  ME Event Time          176767 non-null  object 
 17  Subscriber_List ID     176876 non-null  object 
 18  ME Action Email        176876 non-null  object 
 19  ME Event Date          176876 non-null  object 
 20  ME Event Type          176876 non-null  object 
 21  ME Send URL            16900 non-null   float64
 22  ME URL ID              16900 non-null   float64
 23  ME Click URL           16900 non-null   object 
 24  ME Alias               16900 non-null   object 
 25  Triggered Email Key    52784 non-null   float64
 26  ME_Is_Unique           85115 non-null   object 
 27  ME_Is_Unique_for_URL   16900 non-null   object 
 28  ME Browser             85115 non-null   object 
 29  ME Email Client        85115 non-null   object 
 30  ME OperatingSystem     85115 non-null   object 
 31  ME Device              85115 non-null   object 
 32  ME Bounce Category     4363 non-null    object 
 33  ME Bounce Reason       4363 non-null    object 
 34  ME Unsubscribe Reason  41 non-null      object 
 35  EIS_ID_y               176876 non-null  int64  
dtypes: float64(6), int64(5), object(25)
memory usage: 49.9+ MB
None
(176876, 36)
Out[548]:
EMAIL_TYPE                    0
ClickId_AE               176876
ClickId_ME                    0
SentId                        0
BOT_ACTIVITY               4408
EIS_ID_x                      0
NON_BOT                       0
%Country Mailing Key          0
%Email Activity Key           0
%User Activity Key            0
%Country Key                  0
%Mailing Key                  0
%MDM Key                     39
%Calendar Key                 0
%Campaign Key            131829
%Journey Activity Key         0
ME Event Time               109
Subscriber_List ID            0
ME Action Email               0
ME Event Date                 0
ME Event Type                 0
ME Send URL              159976
ME URL ID                159976
ME Click URL             159976
ME Alias                 159976
Triggered Email Key      124092
ME_Is_Unique              91761
ME_Is_Unique_for_URL     159976
ME Browser                91761
ME Email Client           91761
ME OperatingSystem        91761
ME Device                 91761
ME Bounce Category       172513
ME Bounce Reason         172513
ME Unsubscribe Reason    176835
EIS_ID_y                      0
dtype: int64
In [ ]:
# #joined_table1[joined_table1['ME Click URL'].notna()]
# joined_table1['ME Click URL'].dropna()

Are two primary keys the same? No

In [552]:
joined_table1_test = pd.merge(df1, df2, how = 'inner', on = ['EIS_ID', 'EIS_ID'])
joined_table1_test
Out[552]:
EMAIL_TYPE ClickId_AE ClickId_ME SentId BOT_ACTIVITY EIS_ID NON_BOT %Country Mailing Key %Email Activity Key %User Activity Key ... Triggered Email Key ME_Is_Unique ME_Is_Unique_for_URL ME Browser ME Email Client ME OperatingSystem ME Device ME Bounce Category ME Bounce Reason ME Unsubscribe Reason

0 rows × 35 columns

In [553]:
## Remove a1t.... from SentId column
df1_updated=df1[~df1['SentId'].str.startswith('a1t')]
print(df1_updated.head(3))
print(df1_updated.shape)
  EMAIL_TYPE  ClickId_AE  ClickId_ME                        SentId  \
0         ME         NaN     3019295  NL|1312042|107245|2020-03-16   
1         ME         NaN     2997907  NL|1312042|107596|2020-03-16   
2         ME         NaN     2997909  NL|1312042|107596|2020-03-16   

  BOT_ACTIVITY   EIS_ID  NON_BOT  
0        false  1234400        1  
1        false  1234401        1  
2        false  1234402        1  
(176876, 7)
In [554]:
## Remove the second part of sentId
df1_updated['SentId_updated']=df1_updated['SentId'].str.split('|').apply(lambda x: '|'.join([x[0], x[2], x[3]]))
In [555]:
#### Second Join
joined_table2 = pd.merge(df1_updated, df2, how = 'inner', left_on= 'SentId_updated', right_on= '%User Activity Key')
joined_table2.shape
Out[555]:
(1745426, 37)
In [ ]:
joined_table2.head(3)
In [ ]:
print(joined_table2.nunique())
print(joined_table2.columns)
print(joined_table2.info())
print(joined_table2.shape)
joined_table2.isnull().sum()
In [ ]:
joined_table2['BOT_ACTIVITY'].value_counts()
In [ ]:
joined_table1.head()
In [ ]:
joined_table2['BOT_ACTIVITY'].value_counts()
In [ ]:
joined_table2['%Journey Activity Key'].value_counts() 
In [ ]:
joined_table2.head(3)
In [ ]:
pwd
In [556]:
PingPU=joined_table1.groupby(['ME Action Email']).size().sort_values(ascending=False)
PingPU.head(30)
Out[556]:
ME Action Email
mdkanter@tsz.nl                         751
m.wijnands@etz.nl                       704
jretera@tsz.nl                          561
a.pronk@elisabeth.nl                    526
w.hermans@elisabeth.nl                  456
t.duijnhouwer@gmail.com                 445
jwiddershoven@tsz.nl                    441
jongw@zgv.nl                            368
khamraoui@tsz.nl                        367
mwinkens@tsz.nl                         361
mozmen@tsz.nl                           270
stolkj@zgv.nl                           247
d.nieuwenhuijs@antoniusziekenhuis.nl    231
molp@zgv.nl                             223
sjhagru@wzh.nl                          220
p.reiss@amc.uva.nl                      206
e.nossent@amsterdamumc.nl               203
frontofficecmc@parnassiagroep.nl        200
meijers.mazure@ezorg.nl                 183
m.roovers@elisabeth.nl                  182
kruijfe@zgv.nl                          181
p.rensma@elisabeth.nl                   180
veenma@zgv.nl                           173
m.vonk@reuma.umcn.nl                    171
oudk@zgv.nl                             167
secretariaat.reuma@radboudumc.nl        160
ruiterd@zgv.nl                          157
f.apperloo@elisabeth.nl                 156
l.maussen@cliniccareservices.nl         151
loenhoutt@zgv.nl                        151
dtype: int64
In [557]:
PingPU=joined_table1.groupby(['ME Event Time']).size().sort_values(ascending=False)
PingPU.head(10)
Out[557]:
ME Event Time
11/04/2020 07:00:43    1417
03/28/2021 10:00:51    1355
03/14/2021 16:00:45    1281
06/23/2020 09:26:57    1179
06/23/2020 09:26:54    1142
11/04/2020 07:00:52    1129
06/23/2020 09:26:55    1125
11/04/2020 07:00:51    1123
06/23/2020 09:26:51    1121
04/11/2021 10:01:37    1097
dtype: int64
In [ ]:
PingPU=joined_table1.groupby(['ME Event Date']).size().sort_values(ascending=False)
PingPU.head(10)
In [558]:
joined_table1['ME Event Date'] = pd.to_datetime(joined_table1['ME Event Date'])
joined_table1['ME Event Time'] = pd.to_datetime(joined_table1['ME Event Time'])
joined_table1['%Calendar Key'] = pd.to_datetime(joined_table1['%Calendar Key'])
In [ ]:
print(joined_table1.info())
In [ ]:
#type(joined_table1['ME Event Date'])
In [ ]:
joined_table1.groupby(pd.Grouper(key='ME Event Time',freq='M')).size().sort_values(ascending=False)#.nunique()ascending=True
In [ ]:
joined_table1.groupby(pd.Grouper(key='ME Event Date',freq='M')).size().sort_values(ascending=False)
In [ ]:
joined_table1.groupby(pd.Grouper(key='%Calendar Key',freq='M')).size().sort_values(ascending=False) 

Finalize joined_1 table

In [ ]:
# # Useless columns
# EMAIL_TYPE, ClickId_AE, ClickId_ME, SentId, EIS_ID_x, NON_BOT, SentId_updated, %Country Mailing Key, %Email Activity Key, %User Activity Key, %Country Key, %Mailing Key, %MDM Key
# %Campaign Key, Subscriber_List ID
# Triggered Email Key
# ME Bounce Reason, ME Unsubscribe Reason, EIS_ID_y
In [559]:
Final_joined_table1 = joined_table1.drop(['EMAIL_TYPE', 'ClickId_AE', 'ClickId_ME', 'EIS_ID_x', 'NON_BOT', '%Country Mailing Key', '%User Activity Key', '%Country Key', '%Mailing Key', '%MDM Key',
'%Campaign Key', 'Subscriber_List ID',
'Triggered Email Key',
'ME Bounce Reason', 'ME Unsubscribe Reason', 'EIS_ID_y', '%Journey Activity Key', 'ME Bounce Category'], axis='columns')# 'SentId', '%Email Activity Key'
Final_joined_table1.head(3)
Out[559]:
SentId BOT_ACTIVITY %Email Activity Key %Calendar Key ME Event Time ME Action Email ME Event Date ME Event Type ME Send URL ME URL ID ME Click URL ME Alias ME_Is_Unique ME_Is_Unique_for_URL ME Browser ME Email Client ME OperatingSystem ME Device
0 NL|1312042|107245|2020-03-16 false NL|1312042|107245|2020-03-16 2020-03-16 2020-03-16 13:00:53 apotheekoldebroek@ezorg.nl 2020-03-16 13:00:53 Sent NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 NL|1312042|107596|2020-03-16 false NL|1312042|107596|2020-03-16 2020-03-16 2020-03-16 13:00:53 brodie@xs4all.nl 2020-03-16 13:00:53 Sent NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 NL|1312042|107596|2020-03-16 false NL|1312042|107596|2020-03-16 2020-03-16 2020-03-16 13:06:10 brodie@xs4all.nl 2020-03-16 13:06:10 Open NaN NaN NaN NaN True NaN Unspecified Apple Mail 1 Mac OS X 10.1 Macintosh
In [560]:
print(Final_joined_table1.nunique())
print(Final_joined_table1.columns)
print(Final_joined_table1.info())
print(Final_joined_table1.shape)
Final_joined_table1.isnull().sum()
SentId                  100104
BOT_ACTIVITY                 5
%Email Activity Key     100104
%Calendar Key              548
ME Event Time            57950
ME Action Email          13146
ME Event Date            58007
ME Event Type                5
ME Send URL               1378
ME URL ID                  748
ME Click URL              5652
ME Alias                  2848
ME_Is_Unique                 2
ME_Is_Unique_for_URL         2
ME Browser                  13
ME Email Client             15
ME OperatingSystem          15
ME Device                    5
dtype: int64
Index(['SentId', 'BOT_ACTIVITY', '%Email Activity Key', '%Calendar Key',
       'ME Event Time', 'ME Action Email', 'ME Event Date', 'ME Event Type',
       'ME Send URL', 'ME URL ID', 'ME Click URL', 'ME Alias', 'ME_Is_Unique',
       'ME_Is_Unique_for_URL', 'ME Browser', 'ME Email Client',
       'ME OperatingSystem', 'ME Device'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Int64Index: 176876 entries, 0 to 176875
Data columns (total 18 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   SentId                176876 non-null  object        
 1   BOT_ACTIVITY          172468 non-null  object        
 2   %Email Activity Key   176876 non-null  object        
 3   %Calendar Key         176876 non-null  datetime64[ns]
 4   ME Event Time         176767 non-null  datetime64[ns]
 5   ME Action Email       176876 non-null  object        
 6   ME Event Date         176876 non-null  datetime64[ns]
 7   ME Event Type         176876 non-null  object        
 8   ME Send URL           16900 non-null   float64       
 9   ME URL ID             16900 non-null   float64       
 10  ME Click URL          16900 non-null   object        
 11  ME Alias              16900 non-null   object        
 12  ME_Is_Unique          85115 non-null   object        
 13  ME_Is_Unique_for_URL  16900 non-null   object        
 14  ME Browser            85115 non-null   object        
 15  ME Email Client       85115 non-null   object        
 16  ME OperatingSystem    85115 non-null   object        
 17  ME Device             85115 non-null   object        
dtypes: datetime64[ns](3), float64(2), object(13)
memory usage: 25.6+ MB
None
(176876, 18)
Out[560]:
SentId                       0
BOT_ACTIVITY              4408
%Email Activity Key          0
%Calendar Key                0
ME Event Time              109
ME Action Email              0
ME Event Date                0
ME Event Type                0
ME Send URL             159976
ME URL ID               159976
ME Click URL            159976
ME Alias                159976
ME_Is_Unique             91761
ME_Is_Unique_for_URL    159976
ME Browser               91761
ME Email Client          91761
ME OperatingSystem       91761
ME Device                91761
dtype: int64

Remove NA s in impt vars

In [561]:
Final_joined_table1_NA = Final_joined_table1.dropna(subset= ['ME Browser', 'ME Email Client','ME OperatingSystem', 'ME Device'])
In [562]:
print(Final_joined_table1_NA.nunique())
print(Final_joined_table1_NA.columns)
print(Final_joined_table1_NA.info())
print(Final_joined_table1_NA.shape)
print(Final_joined_table1_NA.isnull().sum())
print(Final_joined_table1_NA['BOT_ACTIVITY'].value_counts())
SentId                  33374
BOT_ACTIVITY                5
%Email Activity Key     33374
%Calendar Key             545
ME Event Time           57108
ME Action Email          7507
ME Event Date           57192
ME Event Type               2
ME Send URL              1378
ME URL ID                 748
ME Click URL             5652
ME Alias                 2848
ME_Is_Unique                2
ME_Is_Unique_for_URL        2
ME Browser                 13
ME Email Client            15
ME OperatingSystem         15
ME Device                   5
dtype: int64
Index(['SentId', 'BOT_ACTIVITY', '%Email Activity Key', '%Calendar Key',
       'ME Event Time', 'ME Action Email', 'ME Event Date', 'ME Event Type',
       'ME Send URL', 'ME URL ID', 'ME Click URL', 'ME Alias', 'ME_Is_Unique',
       'ME_Is_Unique_for_URL', 'ME Browser', 'ME Email Client',
       'ME OperatingSystem', 'ME Device'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
Int64Index: 85115 entries, 2 to 176875
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   SentId                85115 non-null  object        
 1   BOT_ACTIVITY          85115 non-null  object        
 2   %Email Activity Key   85115 non-null  object        
 3   %Calendar Key         85115 non-null  datetime64[ns]
 4   ME Event Time         85006 non-null  datetime64[ns]
 5   ME Action Email       85115 non-null  object        
 6   ME Event Date         85115 non-null  datetime64[ns]
 7   ME Event Type         85115 non-null  object        
 8   ME Send URL           16900 non-null  float64       
 9   ME URL ID             16900 non-null  float64       
 10  ME Click URL          16900 non-null  object        
 11  ME Alias              16900 non-null  object        
 12  ME_Is_Unique          85115 non-null  object        
 13  ME_Is_Unique_for_URL  16900 non-null  object        
 14  ME Browser            85115 non-null  object        
 15  ME Email Client       85115 non-null  object        
 16  ME OperatingSystem    85115 non-null  object        
 17  ME Device             85115 non-null  object        
dtypes: datetime64[ns](3), float64(2), object(13)
memory usage: 12.3+ MB
None
(85115, 18)
SentId                      0
BOT_ACTIVITY                0
%Email Activity Key         0
%Calendar Key               0
ME Event Time             109
ME Action Email             0
ME Event Date               0
ME Event Type               0
ME Send URL             68215
ME URL ID               68215
ME Click URL            68215
ME Alias                68215
ME_Is_Unique                0
ME_Is_Unique_for_URL    68215
ME Browser                  0
ME Email Client             0
ME OperatingSystem          0
ME Device                   0
dtype: int64
false                        77581
Bot_click_model               4690
Bot_open_model                1925
Bot_click_Honeypot             470
Bot_click_Honeypot_around      449
Name: BOT_ACTIVITY, dtype: int64
In [572]:
85115-68215
Out[572]:
16900

Diff between event date and time

In [ ]:
Final_joined_table1_NA['ME Event Time dif'] = Final_joined_table1_NA['ME Event Date'] - Final_joined_table1_NA['ME Event Time']
Final_joined_table1_NA['ME Event Time dif'] = Final_joined_table1_NA['ME Event Time dif']/np.timedelta64(1,'s')
Final_joined_table1_NA.head()
In [ ]:
Final_joined_table1_NA['ME Event Time dif'].describe()

So, we remove "Me event time"

In [ ]:
Final_joined_table1_NA = Final_joined_table1_NA.drop(columns= ['ME Event Time dif', 'ME Event Time', '%Calendar Key'])
Final_joined_table1_NA.head(3)
In [563]:
print(Final_joined_table1_NA['BOT_ACTIVITY'].value_counts())
print(Final_joined_table1_NA['ME Event Type'].value_counts())
print(Final_joined_table1_NA.isnull().sum())
false                        77581
Bot_click_model               4690
Bot_open_model                1925
Bot_click_Honeypot             470
Bot_click_Honeypot_around      449
Name: BOT_ACTIVITY, dtype: int64
Open     68215
Click    16900
Name: ME Event Type, dtype: int64
SentId                      0
BOT_ACTIVITY                0
%Email Activity Key         0
%Calendar Key               0
ME Event Time             109
ME Action Email             0
ME Event Date               0
ME Event Type               0
ME Send URL             68215
ME URL ID               68215
ME Click URL            68215
ME Alias                68215
ME_Is_Unique                0
ME_Is_Unique_for_URL    68215
ME Browser                  0
ME Email Client             0
ME OperatingSystem          0
ME Device                   0
dtype: int64

What is the difference between honeypot and honeypot_around? Are they duplicates?

In [564]:
## Detect duplicate for honeypot around
Final_joined_table1_NA_test = Final_joined_table1_NA.drop(['BOT_ACTIVITY'], axis='columns')
duplicate1 = Final_joined_table1_NA_test[Final_joined_table1_NA_test.duplicated()]

#print(duplicate1)
duplicate1.shape
Out[564]:
(6858, 17)
In [565]:
Final_joined_table1_NA.shape
Out[565]:
(85115, 18)
In [ ]:
duplicate1 = Final_joined_table1_NA[Final_joined_table1_NA.duplicated()]
duplicate1.head(5)
duplicate1.shape
In [ ]:
Final_joined_table1_NA_honey = Final_joined_table1_NA.loc[Final_joined_table1_NA['BOT_ACTIVITY'].astype(str).isin(['Bot_click_Honeypot', 'Bot_click_Honeypot_around'])]
Final_joined_table1_NA_honey.head(3)
#Final_joined_table1_NA_honey.shape
In [ ]:
duplicate = Final_joined_table1_NA_honey[Final_joined_table1_NA_honey.duplicated()]
duplicate.shape
In [ ]:
Final_joined_table1_NA_honey_test = Final_joined_table1_NA_honey.drop(['BOT_ACTIVITY'], axis='columns')
duplicate = Final_joined_table1_NA_honey_test[Final_joined_table1_NA_honey_test.duplicated()]
duplicate.shape
In [ ]:
Final_joined_table1_NA_honey_test.head()
In [ ]:
#Groupby time and link 
In [ ]:
 
In [ ]:
Final_joined_table1_NA = Final_joined_table1_NA.drop(['ME Send URL','ME URL ID','ME Click URL','ME Alias','ME_Is_Unique_for_URL'], axis='columns')
Final_joined_table1_NA.head(3)
print(Final_joined_table1_NA.isnull().sum())
In [ ]:
Final_joined_table1_NA_test= Final_joined_table1_NA[['ME Event Date', 'BOT_ACTIVITY'
]]#'ME Email Client',
# 'ME OperatingSystem',
# 'ME Device'

Sent table to be joined to get the sent time (sent to open, sent to click)

How we join Sent time to Open_Click table?

  • Just Open and Click have records for categorical vars including isunique which makes sence!
  • "%Email Activity Key" from df2 is "SentId" from df1 are same?! "SentId" & "%Email Activity Key" have 100104 unique (both df1, df2 176k)
In [ ]:
Final_joined_table1['ME Event Type'].value_counts()

Not all sent emails get oppened/clicked (87k > 85k)

In [566]:
#Just Open and Sent have reords for "categorical" vars including "isunique"
68215 + 16900 #+ 4363 + 45
Out[566]:
85115

Sent, Open, Click tables

In [ ]:
Final_joined_table1_Sent = Final_joined_table1.loc[Final_joined_table1['ME Event Type'].astype(str) == 'Sent'] #from 176k
Final_joined_table1_Open = Final_joined_table1_NA.loc[Final_joined_table1_NA['ME Event Type'].astype(str) == 'Open'] #from 85k no NAs no Sent
Final_joined_table1_Click = Final_joined_table1_NA.loc[Final_joined_table1_NA['ME Event Type'].astype(str) == 'Click'] #from 85k no NAs no Sent
In [ ]:
 
In [ ]:
print(Final_joined_table1_Sent.shape)
print(Final_joined_table1_Open.shape)
print(Final_joined_table1_Click.shape)

The "sentId"s are less than tables due to multiple clicks? Duplicates?

In [ ]:
print(Final_joined_table1_Sent.isnull().sum())
print(Final_joined_table1_Open.isnull().sum())
print(Final_joined_table1_Click.isnull().sum())
print(Final_joined_table1_Sent.nunique())  # unique SentId                 87258    no NAs
print(Final_joined_table1_Open.nunique())  #        SentId                 33374    no NAs
print(Final_joined_table1_Click.nunique()) #       SentId                 5621     no NAs
print(Final_joined_table1_Sent.shape)      # (87353, 18)
print(Final_joined_table1_Open.shape)      #(68215, 11)
print(Final_joined_table1_Click.shape)     #(16900, 11)
In [ ]:
Final_joined_table1.shape
In [ ]:
 
In [ ]:
Final_joined_table1_NA.shape
In [ ]:
Final_joined_table1_Click.head(3)
In [ ]:
Final_joined_table1_NA.head(3)
In [ ]:
Final_joined_table1_Sent.isnull().sum()

Merging "sent", "open", "click" on SentId with inner join?

Join Click & Open

In [567]:
joined_table_Open_Click = pd.merge(Final_joined_table1_Click, Final_joined_table1_Open, how = 'inner', on= ['SentId'])# , '%Email Activity Key'
joined_table_Open_Click.shape
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-567-3210315a7cf7> in <module>
----> 1 joined_table_Open_Click = pd.merge(Final_joined_table1_Click, Final_joined_table1_Open, how = 'inner', on= ['SentId'])# , '%Email Activity Key'
      2 joined_table_Open_Click.shape

NameError: name 'Final_joined_table1_Click' is not defined
In [568]:
joined_table_Sent_Click = pd.merge(Final_joined_table1_Sent, Final_joined_table1_Click, how = 'left', on= ['SentId'])# , '%Email Activity Key'
joined_table_Sent_Click.shape
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-568-e4099d9ff261> in <module>
----> 1 joined_table_Sent_Click = pd.merge(Final_joined_table1_Sent, Final_joined_table1_Click, how = 'left', on= ['SentId'])# , '%Email Activity Key'
      2 joined_table_Sent_Click.shape

NameError: name 'Final_joined_table1_Sent' is not defined
In [569]:
joined_table_Sent_Click_Open = pd.merge(joined_table_Sent_Click, Final_joined_table1_Open, how = 'left', on= ['SentId'])# , '%Email Activity Key'
joined_table_Sent_Click_Open.shape
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-569-a4ed64377d67> in <module>
----> 1 joined_table_Sent_Click_Open = pd.merge(joined_table_Sent_Click, Final_joined_table1_Open, how = 'left', on= ['SentId'])# , '%Email Activity Key'
      2 joined_table_Sent_Click_Open.shape

NameError: name 'joined_table_Sent_Click' is not defined
In [ ]:
joined_table_Sent_Click_Open.head(5)
In [ ]:
# joined_table_Sent_Click_Open.dropna(subset= ['BOT_ACTIVITY_x', 'BOT_ACTIVITY_y', 'BOT_ACTIVITY'])
In [ ]:
joined_table_Sent_Click_Open.shape
In [ ]:
 

Join Click_Open & Sent

In [ ]:
joined_table_Open_Click_Sent = pd.merge(joined_table_Open_Click, Final_joined_table1_Sent, how = 'inner', on= ['SentId'])# , '%Email Activity Key'
joined_table_Open_Click_Sent.shape
In [ ]:
joined_table_Open_Click_Sent.head(4)
In [ ]:
joined_table_Open_Click_Sent.columns

Keep the relevant columns in the final table

Check if the similar column names (_x _y) are the same
In [ ]:
 
In [ ]:
#### Keep the relevant columns
Final_joined_table_Open_Click_Sent= joined_table_Open_Click_Sent[['SentId', 'BOT_ACTIVITY_x', 'BOT_ACTIVITY_y', 'BOT_ACTIVITY',
       'ME Action Email_x', 'ME Event Date_x', 'ME Event Type_x',
       'ME Event Date_y', 'ME Event Type_y', 
       'ME Event Date', 'ME Event Type', 'ME_Is_Unique_x', 'ME Browser_x', 'ME Email Client_x',
       'ME OperatingSystem_x', 'ME Device_x']] #SentId	BOT_ACTIVITY_x, ME Action Email, ME Event Date_x ME Event Type_x, ME_Is_Unique_x, ME Browser_x	ME Email Client_x	ME OperatingSystem_x	ME Device_x	BOT_ACTIVITY_y ME Event Date_y	ME Event Type_y BOT_ACTIVITY, ME Event Date	ME Event Type

Final table

In [ ]:
Final_joined_table_Open_Click_Sent.head(10)
In [ ]:
print(Final_joined_table_Open_Click_Sent.nunique())
print(Final_joined_table_Open_Click_Sent.columns)
print(Final_joined_table_Open_Click_Sent.info())
print(Final_joined_table_Open_Click_Sent.shape)
print(Final_joined_table_Open_Click_Sent.isnull().sum())
print(Final_joined_table_Open_Click_Sent['BOT_ACTIVITY'].value_counts())
print(Final_joined_table_Open_Click_Sent['BOT_ACTIVITY_x'].value_counts())
print(Final_joined_table_Open_Click_Sent['BOT_ACTIVITY_y'].value_counts())
In [ ]:
 
In [ ]:
# print(Final_joined_table1_NA.isnull().sum())
In [ ]:
# Final_joined_table_Open_Click_Sent= joined_table_Open_Click_Sent.drop(['%Email Activity Key_x', 'ME Send URL','ME URL ID','ME Click URL','ME Alias','ME_Is_Unique_for_URL', 'ME_Is_Unique', 'ME Browser', '%Calendar Key', 'ME Event Time','ME Action Email','%Email Activity Key', 'ME Email Client', 'ME OperatingSystem', 'ME Device', ], axis='columns')
In [ ]:
# joined_table_sent_open_click = pd.merge(Final_joined_table1_NA, Final_joined_table1_sent, how = 'inner', on= ['SentId', '%Email Activity Key'])# , '%Email Activity Key'
In [ ]:
# joined_table_sent_open_click = pd.merge(Final_joined_table1, Final_joined_table1_sent, how = 'selfjoin', on= ['BOT_ACTIVITY', 'ME Action Email', 'ME Event Date',
#        'ME_Is_Unique', 'ME Browser', 'ME Email Client', 'ME OperatingSystem',
#        'ME Device']) 
In [ ]:
# joined_table_sent_open_click.shape
In [ ]:
# joined_table_sent_open_click.head(3)
In [ ]:
# print(joined_table_sent_open_click.isnull().sum())
# joined_table_sent_open_click_NA = joined_table_sent_open_click.dropna()
In [ ]:
# print(joined_table_sent_open_click_NA.isnull().sum())

Merge BOT_ACTIVITY labels 5-->2

In [ ]:
joined_table_sent_open_click_NA.shape
In [ ]:
 

Time Feature engineering

In [ ]:
# change values to datetime, create new columns for dif and change to seconds
# df['click_date'] = pd.to_datetime(df['click_date'])
# df['open_date'] = pd.to_datetime(df['open_date'])
# df['sent_date'] = pd.to_datetime(df['sent_date'])

Final_joined_table_Open_Click_Sent['open_to_click'] = Final_joined_table_Open_Click_Sent['ME Event Date_x'] - Final_joined_table_Open_Click_Sent['ME Event Date_y']
Final_joined_table_Open_Click_Sent['sent_to_click'] = Final_joined_table_Open_Click_Sent['ME Event Date_x'] - Final_joined_table_Open_Click_Sent['ME Event Date']
Final_joined_table_Open_Click_Sent['sent_to_open'] = Final_joined_table_Open_Click_Sent['ME Event Date_y'] - Final_joined_table_Open_Click_Sent['ME Event Date']

Final_joined_table_Open_Click_Sent['open_to_click'] = Final_joined_table_Open_Click_Sent['open_to_click']/np.timedelta64(1,'s')
Final_joined_table_Open_Click_Sent['sent_to_click'] = Final_joined_table_Open_Click_Sent['sent_to_click']/np.timedelta64(1,'s')
Final_joined_table_Open_Click_Sent['sent_to_open'] = Final_joined_table_Open_Click_Sent['sent_to_open']/np.timedelta64(1,'s')
In [ ]:
 
In [ ]:
Final_joined_table_Open_Click_Sent.head(2)
In [ ]:
Final_joined_table_Open_Click_Sent_HP = Final_joined_table_Open_Click_Sent.loc[Final_joined_table_Open_Click_Sent['ME Event Date_x'].astype(str) > '2021-03-31 13:00:56']
In [ ]:
Final_joined_table_Open_Click_Sent_HP.shape
In [ ]:
print(Final_joined_table_Open_Click_Sent_HP['BOT_ACTIVITY_x'].value_counts()) 
In [ ]:
100*(36898+24520)/215824
In [531]:
plt.figure(figsize=(8,6))
plt.hist(Final_joined_table_Open_Click_Sent['sent_to_open'], bins=30, alpha=0.55, label="sent_to_open", color='purple')
plt.hist(Final_joined_table_Open_Click_Sent['sent_to_click'], bins=90, alpha=0.99, label="sent_to_click", color='black')
plt.hist(Final_joined_table_Open_Click_Sent['open_to_click'], bins=90, alpha=0.5, label="open_to_click", color='yellow')

plt.xlabel("time", size=14)
plt.ylabel("Count", size=14)
plt.title("Open vs click")
plt.legend(loc='upper right')
plt.savefig("overlapping_histograms_with_matplotlib_Python_2.png")
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-531-8f2193f799ab> in <module>
      1 plt.figure(figsize=(8,6))
----> 2 plt.hist(Final_joined_table_Open_Click_Sent['sent_to_open'], bins=30, alpha=0.55, label="sent_to_open", color='purple')
      3 plt.hist(Final_joined_table_Open_Click_Sent['sent_to_click'], bins=90, alpha=0.99, label="sent_to_click", color='black')
      4 plt.hist(Final_joined_table_Open_Click_Sent['open_to_click'], bins=90, alpha=0.5, label="open_to_click", color='yellow')
      5 

NameError: name 'Final_joined_table_Open_Click_Sent' is not defined
<Figure size 576x432 with 0 Axes>
In [532]:
#scatterplot
sns.set()
cols = ['BOT_ACTIVITY', 'ME Event Date_x', 
       'ME Event Date_y', 'ME Event Date',
        'ME Browser_x', 'ME Email Client_x',
       'ME OperatingSystem_x', 'ME Device_x', 'open_to_click', 'sent_to_click',
       'sent_to_open']#'BOT_ACTIVITY_x', 'BOT_ACTIVITY_y', 'ME_Is_Unique_x'
sns.pairplot(Final_joined_table_Open_Click_Sent[cols], size = 2.5)
plt.show();
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-532-1b105b6ad67b> in <module>
      6        'ME OperatingSystem_x', 'ME Device_x', 'open_to_click', 'sent_to_click',
      7        'sent_to_open']#'BOT_ACTIVITY_x', 'BOT_ACTIVITY_y', 'ME_Is_Unique_x'
----> 8 sns.pairplot(Final_joined_table_Open_Click_Sent[cols], size = 2.5)
      9 plt.show();

NameError: name 'Final_joined_table_Open_Click_Sent' is not defined
In [ ]:
cols
In [ ]:
 
In [ ]:
 
In [ ]:
 

Anamoly detection - Pycaret

In [ ]:
# # Importing module and initializing setup
# from pycaret.anomaly import *
# ano1 = setup(data = Final_joined_table1_NA_test)
# # creating a model
# iforest = create_model('iforest')
# # plotting a model
# plot_model(iforest)

Honeypot link

In [ ]:
# The HP link can be found in the URL that contains “https://www.msd.nl/?utm_term=bot_activity”
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 

New join (Chen)

In [91]:
df['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
Out[91]:
false                        21037
Bot_click_model                929
Bot_open_model                 649
Bot_click_Honeypot             470
Bot_click_Honeypot_around      449
Name: BOT_ACTIVITY_FROM_CLICK_OR_OPEN, dtype: int64
In [92]:
df = pd.read_csv('Netherland_email_for_anomaly_detection_since_March_31_2021_with_23534_37.csv')
print(df.nunique())
print(df.columns)
print(df.info())
print(df.shape)
df.isnull().sum()
Unnamed: 0                                  21593
SentId                                       8525
ME Action Email                              3716
ME Event Type_FROM_SENT                         1
ME Event Type_FROM_CLICK_OR_OPEN                2
ME Event Date_FROM_SENT                       174
ME Event Date_FROM_CLICK_OR_OPEN            15234
BOT_ACTIVITY_FROM_SENT                          1
BOT_ACTIVITY_FROM_CLICK_OR_OPEN                 5
Triggered Email Key_FROM_SENT                  65
%Country Mailing Key_FROM_CLICK_OR_OPEN       115
%Email Activity Key_FROM_CLICK_OR_OPEN       8525
%User Activity Key_FROM_CLICK_OR_OPEN        8495
%Mailing Key_FROM_CLICK_OR_OPEN               115
%MDM Key_FROM_CLICK_OR_OPEN                  3703
%Calendar Key_FROM_CLICK_OR_OPEN               46
%Journey Activity Key_FROM_CLICK_OR_OPEN       66
Subscriber_List ID_FROM_CLICK_OR_OPEN        3761
ME Send URL_FROM_CLICK_OR_OPEN                757
ME URL ID_FROM_CLICK_OR_OPEN                  507
ME Click URL_FROM_CLICK_OR_OPEN              2097
ME Alias_FROM_CLICK_OR_OPEN                  1169
Triggered Email Key_FROM_CLICK_OR_OPEN         65
ME_Is_Unique_FROM_CLICK_OR_OPEN                 2
ME_Is_Unique_for_URL_FROM_CLICK_OR_OPEN         2
ME Browser_FROM_CLICK_OR_OPEN                   9
ME Email Client_FROM_CLICK_OR_OPEN             12
ME OperatingSystem_FROM_CLICK_OR_OPEN          12
ME Device_FROM_CLICK_OR_OPEN                    5
%Country Mailing Key_FROM_SENT                115
%Email Activity Key_FROM_SENT                8525
%User Activity Key_FROM_SENT                 8495
%Mailing Key_FROM_SENT                        115
%MDM Key_FROM_SENT                           3703
%Calendar Key_FROM_SENT                        46
%Campaign Key_FROM_SENT                        14
%Journey Activity Key_FROM_SENT                66
Subscriber_List ID_FROM_SENT                 3761
dtype: int64
Index(['Unnamed: 0', 'SentId', 'ME Action Email', 'ME Event Type_FROM_SENT',
       'ME Event Type_FROM_CLICK_OR_OPEN', 'ME Event Date_FROM_SENT',
       'ME Event Date_FROM_CLICK_OR_OPEN', 'BOT_ACTIVITY_FROM_SENT',
       'BOT_ACTIVITY_FROM_CLICK_OR_OPEN', 'Triggered Email Key_FROM_SENT',
       '%Country Mailing Key_FROM_CLICK_OR_OPEN',
       '%Email Activity Key_FROM_CLICK_OR_OPEN',
       '%User Activity Key_FROM_CLICK_OR_OPEN',
       '%Mailing Key_FROM_CLICK_OR_OPEN', '%MDM Key_FROM_CLICK_OR_OPEN',
       '%Calendar Key_FROM_CLICK_OR_OPEN',
       '%Journey Activity Key_FROM_CLICK_OR_OPEN',
       'Subscriber_List ID_FROM_CLICK_OR_OPEN',
       'ME Send URL_FROM_CLICK_OR_OPEN', 'ME URL ID_FROM_CLICK_OR_OPEN',
       'ME Click URL_FROM_CLICK_OR_OPEN', 'ME Alias_FROM_CLICK_OR_OPEN',
       'Triggered Email Key_FROM_CLICK_OR_OPEN',
       'ME_Is_Unique_FROM_CLICK_OR_OPEN',
       'ME_Is_Unique_for_URL_FROM_CLICK_OR_OPEN',
       'ME Browser_FROM_CLICK_OR_OPEN', 'ME Email Client_FROM_CLICK_OR_OPEN',
       'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN',
       '%Country Mailing Key_FROM_SENT', '%Email Activity Key_FROM_SENT',
       '%User Activity Key_FROM_SENT', '%Mailing Key_FROM_SENT',
       '%MDM Key_FROM_SENT', '%Calendar Key_FROM_SENT',
       '%Campaign Key_FROM_SENT', '%Journey Activity Key_FROM_SENT',
       'Subscriber_List ID_FROM_SENT'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23534 entries, 0 to 23533
Data columns (total 38 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Unnamed: 0                                23534 non-null  int64  
 1   SentId                                    23534 non-null  object 
 2   ME Action Email                           23534 non-null  object 
 3   ME Event Type_FROM_SENT                   23534 non-null  object 
 4   ME Event Type_FROM_CLICK_OR_OPEN          23534 non-null  object 
 5   ME Event Date_FROM_SENT                   23534 non-null  object 
 6   ME Event Date_FROM_CLICK_OR_OPEN          23534 non-null  object 
 7   BOT_ACTIVITY_FROM_SENT                    23534 non-null  bool   
 8   BOT_ACTIVITY_FROM_CLICK_OR_OPEN           23534 non-null  object 
 9   Triggered Email Key_FROM_SENT             11126 non-null  float64
 10  %Country Mailing Key_FROM_CLICK_OR_OPEN   23534 non-null  object 
 11  %Email Activity Key_FROM_CLICK_OR_OPEN    23534 non-null  object 
 12  %User Activity Key_FROM_CLICK_OR_OPEN     23534 non-null  object 
 13  %Mailing Key_FROM_CLICK_OR_OPEN           23534 non-null  float64
 14  %MDM Key_FROM_CLICK_OR_OPEN               23526 non-null  float64
 15  %Calendar Key_FROM_CLICK_OR_OPEN          23534 non-null  object 
 16  %Journey Activity Key_FROM_CLICK_OR_OPEN  23534 non-null  object 
 17  Subscriber_List ID_FROM_CLICK_OR_OPEN     23534 non-null  object 
 18  ME Send URL_FROM_CLICK_OR_OPEN            6181 non-null   float64
 19  ME URL ID_FROM_CLICK_OR_OPEN              6181 non-null   float64
 20  ME Click URL_FROM_CLICK_OR_OPEN           6181 non-null   object 
 21  ME Alias_FROM_CLICK_OR_OPEN               6181 non-null   object 
 22  Triggered Email Key_FROM_CLICK_OR_OPEN    11126 non-null  float64
 23  ME_Is_Unique_FROM_CLICK_OR_OPEN           23534 non-null  bool   
 24  ME_Is_Unique_for_URL_FROM_CLICK_OR_OPEN   6181 non-null   object 
 25  ME Browser_FROM_CLICK_OR_OPEN             23534 non-null  object 
 26  ME Email Client_FROM_CLICK_OR_OPEN        23534 non-null  object 
 27  ME OperatingSystem_FROM_CLICK_OR_OPEN     23534 non-null  object 
 28  ME Device_FROM_CLICK_OR_OPEN              23534 non-null  object 
 29  %Country Mailing Key_FROM_SENT            23534 non-null  object 
 30  %Email Activity Key_FROM_SENT             23534 non-null  object 
 31  %User Activity Key_FROM_SENT              23534 non-null  object 
 32  %Mailing Key_FROM_SENT                    23534 non-null  int64  
 33  %MDM Key_FROM_SENT                        23526 non-null  float64
 34  %Calendar Key_FROM_SENT                   23534 non-null  object 
 35  %Campaign Key_FROM_SENT                   20202 non-null  float64
 36  %Journey Activity Key_FROM_SENT           23534 non-null  object 
 37  Subscriber_List ID_FROM_SENT              23534 non-null  object 
dtypes: bool(2), float64(8), int64(2), object(26)
memory usage: 6.5+ MB
None
(23534, 38)
Out[92]:
Unnamed: 0                                      0
SentId                                          0
ME Action Email                                 0
ME Event Type_FROM_SENT                         0
ME Event Type_FROM_CLICK_OR_OPEN                0
ME Event Date_FROM_SENT                         0
ME Event Date_FROM_CLICK_OR_OPEN                0
BOT_ACTIVITY_FROM_SENT                          0
BOT_ACTIVITY_FROM_CLICK_OR_OPEN                 0
Triggered Email Key_FROM_SENT               12408
%Country Mailing Key_FROM_CLICK_OR_OPEN         0
%Email Activity Key_FROM_CLICK_OR_OPEN          0
%User Activity Key_FROM_CLICK_OR_OPEN           0
%Mailing Key_FROM_CLICK_OR_OPEN                 0
%MDM Key_FROM_CLICK_OR_OPEN                     8
%Calendar Key_FROM_CLICK_OR_OPEN                0
%Journey Activity Key_FROM_CLICK_OR_OPEN        0
Subscriber_List ID_FROM_CLICK_OR_OPEN           0
ME Send URL_FROM_CLICK_OR_OPEN              17353
ME URL ID_FROM_CLICK_OR_OPEN                17353
ME Click URL_FROM_CLICK_OR_OPEN             17353
ME Alias_FROM_CLICK_OR_OPEN                 17353
Triggered Email Key_FROM_CLICK_OR_OPEN      12408
ME_Is_Unique_FROM_CLICK_OR_OPEN                 0
ME_Is_Unique_for_URL_FROM_CLICK_OR_OPEN     17353
ME Browser_FROM_CLICK_OR_OPEN                   0
ME Email Client_FROM_CLICK_OR_OPEN              0
ME OperatingSystem_FROM_CLICK_OR_OPEN           0
ME Device_FROM_CLICK_OR_OPEN                    0
%Country Mailing Key_FROM_SENT                  0
%Email Activity Key_FROM_SENT                   0
%User Activity Key_FROM_SENT                    0
%Mailing Key_FROM_SENT                          0
%MDM Key_FROM_SENT                              8
%Calendar Key_FROM_SENT                         0
%Campaign Key_FROM_SENT                      3332
%Journey Activity Key_FROM_SENT                 0
Subscriber_List ID_FROM_SENT                    0
dtype: int64
In [ ]:
 
In [ ]:
 
In [93]:
df.head()
Out[93]:
Unnamed: 0 SentId ME Action Email ME Event Type_FROM_SENT ME Event Type_FROM_CLICK_OR_OPEN ME Event Date_FROM_SENT ME Event Date_FROM_CLICK_OR_OPEN BOT_ACTIVITY_FROM_SENT BOT_ACTIVITY_FROM_CLICK_OR_OPEN Triggered Email Key_FROM_SENT %Country Mailing Key_FROM_CLICK_OR_OPEN %Email Activity Key_FROM_CLICK_OR_OPEN %User Activity Key_FROM_CLICK_OR_OPEN %Mailing Key_FROM_CLICK_OR_OPEN %MDM Key_FROM_CLICK_OR_OPEN %Calendar Key_FROM_CLICK_OR_OPEN %Journey Activity Key_FROM_CLICK_OR_OPEN Subscriber_List ID_FROM_CLICK_OR_OPEN ME Send URL_FROM_CLICK_OR_OPEN ME URL ID_FROM_CLICK_OR_OPEN ME Click URL_FROM_CLICK_OR_OPEN ME Alias_FROM_CLICK_OR_OPEN Triggered Email Key_FROM_CLICK_OR_OPEN ME_Is_Unique_FROM_CLICK_OR_OPEN ME_Is_Unique_for_URL_FROM_CLICK_OR_OPEN ME Browser_FROM_CLICK_OR_OPEN ME Email Client_FROM_CLICK_OR_OPEN ME OperatingSystem_FROM_CLICK_OR_OPEN ME Device_FROM_CLICK_OR_OPEN %Country Mailing Key_FROM_SENT %Email Activity Key_FROM_SENT %User Activity Key_FROM_SENT %Mailing Key_FROM_SENT %MDM Key_FROM_SENT %Calendar Key_FROM_SENT %Campaign Key_FROM_SENT %Journey Activity Key_FROM_SENT Subscriber_List ID_FROM_SENT
0 94321 NL|2553841|134552|2021-10-17 81@xs4all.nl Sent Open 2021-10-17 10:03:52 2021-10-17 10:07:54 False false 141619.0 NL|2553841 NL|2553841|134552|2021-10-17 NL|134552|2021-10-17 2553841.0 134552.0 10/17/2021 00:00:00 2EE4C618-3A2C-EC11-A305-1402EC938A50 240099621 - 5388 NaN NaN NaN NaN 141619.0 True NaN Safari iPhone iOS iPhone NL|2553841 NL|2553841|134552|2021-10-17 NL|134552|2021-10-17 2553841 134552.0 10/17/2021 00:00:00 3717.0 2EE4C618-3A2C-EC11-A305-1402EC938A50 240099621 - 5388
1 101257 NL|2585825|109216|2021-10-28 Apotheekdehamershof@xs4all.nl Sent Open 2021-10-28 10:03:45 2021-10-28 11:13:59 False false 143820.0 NL|2585825 NL|2585825|109216|2021-10-28 NL|109216|2021-10-28 2585825.0 109216.0 10/28/2021 00:00:00 A538359E-1C37-EC11-A30E-1402EC938A8C 311867813 - 5388 NaN NaN NaN NaN 143820.0 True NaN Unspecified Unspecified Unspecified Unspecified NL|2585825 NL|2585825|109216|2021-10-28 NL|109216|2021-10-28 2585825 109216.0 10/28/2021 00:00:00 3772.0 A538359E-1C37-EC11-A30E-1402EC938A8C 311867813 - 5388
2 95885 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Sent Click 2021-11-09 10:02:16 2021-11-09 10:51:24 False false 143834.0 NL|2585841 NL|2585841|83606|2021-11-09 NL|83606|2021-11-09 2585841.0 83606.0 11/09/2021 00:00:00 23BC9EB0-1C37-EC11-A30E-1402EC938A8C 304646054 - 5388 27647496.0 1704520.0 https://image.mail.msdconnect.nl/lib/fe9013737... VAC_PNE_Mail3-Button6-Infographic_DOW_IMG 143834.0 True True Android Android Android Unspecified NL|2585841 NL|2585841|83606|2021-11-09 NL|83606|2021-11-09 2585841 83606.0 11/09/2021 00:00:00 3772.0 23BC9EB0-1C37-EC11-A30E-1402EC938A8C 304646054 - 5388
3 95886 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Sent Click 2021-11-09 10:02:16 2021-11-09 10:51:57 False false 143834.0 NL|2585841 NL|2585841|83606|2021-11-09 NL|83606|2021-11-09 2585841.0 83606.0 11/09/2021 00:00:00 23BC9EB0-1C37-EC11-A30E-1402EC938A8C 304646054 - 5388 27647496.0 1704520.0 https://image.mail.msdconnect.nl/lib/fe9013737... VAC_PNE_Mail3-Button6-Infographic_DOW_IMG 143834.0 False False Android Android Android Unspecified NL|2585841 NL|2585841|83606|2021-11-09 NL|83606|2021-11-09 2585841 83606.0 11/09/2021 00:00:00 3772.0 23BC9EB0-1C37-EC11-A30E-1402EC938A8C 304646054 - 5388
4 109706 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Sent Open 2021-11-09 10:02:16 2021-11-09 10:50:17 False false 143834.0 NL|2585841 NL|2585841|83606|2021-11-09 NL|83606|2021-11-09 2585841.0 83606.0 11/09/2021 00:00:00 23BC9EB0-1C37-EC11-A30E-1402EC938A8C 304646054 - 5388 NaN NaN NaN NaN 143834.0 True NaN Firefox Unspecified Windows XP PC NL|2585841 NL|2585841|83606|2021-11-09 NL|83606|2021-11-09 2585841 83606.0 11/09/2021 00:00:00 3772.0 23BC9EB0-1C37-EC11-A30E-1402EC938A8C 304646054 - 5388
In [ ]:
 
In [94]:
df[['ME Action Email']].eq('w.hermans@elisabeth.nl').sum()
Out[94]:
ME Action Email    436
dtype: int64
In [95]:
df_sub = df[['SentId','ME Action Email', 'ME Event Type_FROM_SENT',
       'ME Event Type_FROM_CLICK_OR_OPEN', 'ME Event Date_FROM_SENT',
       'ME Event Date_FROM_CLICK_OR_OPEN',
       'BOT_ACTIVITY_FROM_CLICK_OR_OPEN', 'ME_Is_Unique_FROM_CLICK_OR_OPEN', 
       'ME Browser_FROM_CLICK_OR_OPEN', 'ME Email Client_FROM_CLICK_OR_OPEN',
       'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN']]
df_sub.head(3)
Out[95]:
SentId ME Action Email ME Event Type_FROM_SENT ME Event Type_FROM_CLICK_OR_OPEN ME Event Date_FROM_SENT ME Event Date_FROM_CLICK_OR_OPEN BOT_ACTIVITY_FROM_CLICK_OR_OPEN ME_Is_Unique_FROM_CLICK_OR_OPEN ME Browser_FROM_CLICK_OR_OPEN ME Email Client_FROM_CLICK_OR_OPEN ME OperatingSystem_FROM_CLICK_OR_OPEN ME Device_FROM_CLICK_OR_OPEN
0 NL|2553841|134552|2021-10-17 81@xs4all.nl Sent Open 2021-10-17 10:03:52 2021-10-17 10:07:54 false True Safari iPhone iOS iPhone
1 NL|2585825|109216|2021-10-28 Apotheekdehamershof@xs4all.nl Sent Open 2021-10-28 10:03:45 2021-10-28 11:13:59 false True Unspecified Unspecified Unspecified Unspecified
2 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Sent Click 2021-11-09 10:02:16 2021-11-09 10:51:24 false True Android Android Android Unspecified
In [96]:
# df_sub_sorted= df_sub.groupby(['ME Action Email', 'SentId']).apply(lambda x: x.sort_values('ME Event Date_FROM_CLICK_OR_OPEN'))
# df_sub_sorted.head(70)

Sort based on email and sentId and time

In [97]:
df_sub_sorted = df_sub.sort_values(by=['ME Action Email', 'SentId', 'ME Event Date_FROM_CLICK_OR_OPEN'], axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last', ignore_index=False, key=None)
In [98]:
df_sub_sorted.head(3)
Out[98]:
SentId ME Action Email ME Event Type_FROM_SENT ME Event Type_FROM_CLICK_OR_OPEN ME Event Date_FROM_SENT ME Event Date_FROM_CLICK_OR_OPEN BOT_ACTIVITY_FROM_CLICK_OR_OPEN ME_Is_Unique_FROM_CLICK_OR_OPEN ME Browser_FROM_CLICK_OR_OPEN ME Email Client_FROM_CLICK_OR_OPEN ME OperatingSystem_FROM_CLICK_OR_OPEN ME Device_FROM_CLICK_OR_OPEN
0 NL|2553841|134552|2021-10-17 81@xs4all.nl Sent Open 2021-10-17 10:03:52 2021-10-17 10:07:54 false True Safari iPhone iOS iPhone
1 NL|2585825|109216|2021-10-28 Apotheekdehamershof@xs4all.nl Sent Open 2021-10-28 10:03:45 2021-10-28 11:13:59 false True Unspecified Unspecified Unspecified Unspecified
4 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Sent Open 2021-11-09 10:02:16 2021-11-09 10:50:17 false True Firefox Unspecified Windows XP PC
In [99]:
df_sub_sorted.columns
Out[99]:
Index(['SentId', 'ME Action Email', 'ME Event Type_FROM_SENT',
       'ME Event Type_FROM_CLICK_OR_OPEN', 'ME Event Date_FROM_SENT',
       'ME Event Date_FROM_CLICK_OR_OPEN', 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN',
       'ME_Is_Unique_FROM_CLICK_OR_OPEN', 'ME Browser_FROM_CLICK_OR_OPEN',
       'ME Email Client_FROM_CLICK_OR_OPEN',
       'ME OperatingSystem_FROM_CLICK_OR_OPEN',
       'ME Device_FROM_CLICK_OR_OPEN'],
      dtype='object')
In [100]:
# df_sub_sorted =df_sub.groupby(['ME Action Email', 'SentId', 'ME Event Date_FROM_CLICK_OR_OPEN']).size().reset_index(name='count')
# #.size().sort_values(ascending=False)
# df_sub_sorted.head(30)
In [101]:
df_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
Out[101]:
false                        21037
Bot_click_model                929
Bot_open_model                 649
Bot_click_Honeypot             470
Bot_click_Honeypot_around      449
Name: BOT_ACTIVITY_FROM_CLICK_OR_OPEN, dtype: int64
In [102]:
print(df_sub.nunique())
# print(df.columns)
# print(df.info())
# print(df.shape)
df_sub.isnull().sum()
SentId                                    8525
ME Action Email                           3716
ME Event Type_FROM_SENT                      1
ME Event Type_FROM_CLICK_OR_OPEN             2
ME Event Date_FROM_SENT                    174
ME Event Date_FROM_CLICK_OR_OPEN         15234
BOT_ACTIVITY_FROM_CLICK_OR_OPEN              5
ME_Is_Unique_FROM_CLICK_OR_OPEN              2
ME Browser_FROM_CLICK_OR_OPEN                9
ME Email Client_FROM_CLICK_OR_OPEN          12
ME OperatingSystem_FROM_CLICK_OR_OPEN       12
ME Device_FROM_CLICK_OR_OPEN                 5
dtype: int64
Out[102]:
SentId                                   0
ME Action Email                          0
ME Event Type_FROM_SENT                  0
ME Event Type_FROM_CLICK_OR_OPEN         0
ME Event Date_FROM_SENT                  0
ME Event Date_FROM_CLICK_OR_OPEN         0
BOT_ACTIVITY_FROM_CLICK_OR_OPEN          0
ME_Is_Unique_FROM_CLICK_OR_OPEN          0
ME Browser_FROM_CLICK_OR_OPEN            0
ME Email Client_FROM_CLICK_OR_OPEN       0
ME OperatingSystem_FROM_CLICK_OR_OPEN    0
ME Device_FROM_CLICK_OR_OPEN             0
dtype: int64

Add sent to open/click column

In [103]:
# change values to datetime, create new columns for dif and change to seconds
df_sub['ME Event Date_FROM_SENT'] = pd.to_datetime(df_sub['ME Event Date_FROM_SENT'])
df_sub['ME Event Date_FROM_CLICK_OR_OPEN'] = pd.to_datetime(df_sub['ME Event Date_FROM_CLICK_OR_OPEN'])

# df_sub['open_to_click'] = df_sub['ME Event Date_x'] - df_sub['ME Event Date_y']
# df_sub['sent_to_click'] = df_sub['ME Event Date_x'] - df_sub['ME Event Date_FROM_SENT']
df_sub['Sent_to_OpenClick'] = df_sub['ME Event Date_FROM_CLICK_OR_OPEN'] - df_sub['ME Event Date_FROM_SENT']

df_sub['Sent_to_OpenClick'] = df_sub['Sent_to_OpenClick']/np.timedelta64(1,'s')
# df_sub['sent_to_click'] = df_sub['sent_to_click']/np.timedelta64(1,'s')
# df_sub['sent_to_open'] = df_sub['sent_to_open']/np.timedelta64(1,'s')
In [104]:
df_sub.head()
Out[104]:
SentId ME Action Email ME Event Type_FROM_SENT ME Event Type_FROM_CLICK_OR_OPEN ME Event Date_FROM_SENT ME Event Date_FROM_CLICK_OR_OPEN BOT_ACTIVITY_FROM_CLICK_OR_OPEN ME_Is_Unique_FROM_CLICK_OR_OPEN ME Browser_FROM_CLICK_OR_OPEN ME Email Client_FROM_CLICK_OR_OPEN ME OperatingSystem_FROM_CLICK_OR_OPEN ME Device_FROM_CLICK_OR_OPEN Sent_to_OpenClick
0 NL|2553841|134552|2021-10-17 81@xs4all.nl Sent Open 2021-10-17 10:03:52 2021-10-17 10:07:54 false True Safari iPhone iOS iPhone 242.0
1 NL|2585825|109216|2021-10-28 Apotheekdehamershof@xs4all.nl Sent Open 2021-10-28 10:03:45 2021-10-28 11:13:59 false True Unspecified Unspecified Unspecified Unspecified 4214.0
2 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Sent Click 2021-11-09 10:02:16 2021-11-09 10:51:24 false True Android Android Android Unspecified 2948.0
3 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Sent Click 2021-11-09 10:02:16 2021-11-09 10:51:57 false False Android Android Android Unspecified 2981.0
4 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Sent Open 2021-11-09 10:02:16 2021-11-09 10:50:17 false True Firefox Unspecified Windows XP PC 2881.0
In [105]:
plt.figure(figsize=(8,6))
plt.hist(df_sub['Sent_to_OpenClick'], bins=30, alpha=0.55, label="sent_to_OpenClick", color='purple')
# plt.hist(Final_joined_table_Open_Click_Sent['sent_to_click'], bins=90, alpha=0.99, label="sent_to_click", color='black')
# plt.hist(Final_joined_table_Open_Click_Sent['open_to_click'], bins=90, alpha=0.5, label="open_to_click", color='yellow')

plt.xlabel("time", size=14)
plt.ylabel("Count", size=14)
plt.title("Sent to Open/Click")
plt.legend(loc='upper right')
plt.savefig("overlapping_histograms_with_matplotlib_Python_2.png")
In [106]:
df_sub_Open = df_sub.loc[df_sub['ME Event Type_FROM_CLICK_OR_OPEN'].astype(str) == 'Open']
df_sub_Click = df_sub.loc[df_sub['ME Event Type_FROM_CLICK_OR_OPEN'].astype(str) == 'Click'] 
print('num of Clicks', df_sub_Click.shape)
print('num of Opens', df_sub_Open.shape)
print('Sents', df_sub.shape)
num of Clicks (6181, 13)
num of Opens (17353, 13)
Sents (23534, 13)
In [107]:
17353*6181
Out[107]:
107258893
In [108]:
Joining_Open_and_Click_3features = pd.merge(df_sub_Open, df_sub_Click, how = 'inner', on= ['ME Action Email', 'SentId'])
Joining_Open_and_Click_3features.shape
Out[108]:
(216423, 24)
In [109]:
Joining_Open_and_Click_3features.head(30)
Out[109]:
SentId ME Action Email ME Event Type_FROM_SENT_x ME Event Type_FROM_CLICK_OR_OPEN_x ME Event Date_FROM_SENT_x ME Event Date_FROM_CLICK_OR_OPEN_x BOT_ACTIVITY_FROM_CLICK_OR_OPEN_x ME_Is_Unique_FROM_CLICK_OR_OPEN_x ME Browser_FROM_CLICK_OR_OPEN_x ME Email Client_FROM_CLICK_OR_OPEN_x ME OperatingSystem_FROM_CLICK_OR_OPEN_x ME Device_FROM_CLICK_OR_OPEN_x Sent_to_OpenClick_x ME Event Type_FROM_SENT_y ME Event Type_FROM_CLICK_OR_OPEN_y ME Event Date_FROM_SENT_y ME Event Date_FROM_CLICK_OR_OPEN_y BOT_ACTIVITY_FROM_CLICK_OR_OPEN_y ME_Is_Unique_FROM_CLICK_OR_OPEN_y ME Browser_FROM_CLICK_OR_OPEN_y ME Email Client_FROM_CLICK_OR_OPEN_y ME OperatingSystem_FROM_CLICK_OR_OPEN_y ME Device_FROM_CLICK_OR_OPEN_y Sent_to_OpenClick_y
0 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Sent Open 2021-11-09 10:02:16 2021-11-09 10:50:17 false True Firefox Unspecified Windows XP PC 2881.0 Sent Click 2021-11-09 10:02:16 2021-11-09 10:51:24 false True Android Android Android Unspecified 2948.0
1 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Sent Open 2021-11-09 10:02:16 2021-11-09 10:50:17 false True Firefox Unspecified Windows XP PC 2881.0 Sent Click 2021-11-09 10:02:16 2021-11-09 10:51:57 false False Android Android Android Unspecified 2981.0
2 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Sent Open 2021-11-09 10:02:16 2021-11-09 16:26:59 false False Firefox Unspecified Windows XP PC 23083.0 Sent Click 2021-11-09 10:02:16 2021-11-09 10:51:24 false True Android Android Android Unspecified 2948.0
3 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Sent Open 2021-11-09 10:02:16 2021-11-09 16:26:59 false False Firefox Unspecified Windows XP PC 23083.0 Sent Click 2021-11-09 10:02:16 2021-11-09 10:51:57 false False Android Android Android Unspecified 2981.0
4 NL|2553835|130706|2021-10-14 HDOEI@TELENET.BE Sent Open 2021-10-14 10:04:04 2021-10-14 10:56:22 false True Unspecified Unspecified Unspecified Unspecified 3138.0 Sent Click 2021-10-14 10:04:04 2021-10-14 10:56:37 false True Unspecified Unspecified Windows PC 3153.0
5 NL|2553835|130706|2021-10-14 HDOEI@TELENET.BE Sent Open 2021-10-14 10:04:04 2021-10-14 10:56:22 false True Unspecified Unspecified Unspecified Unspecified 3138.0 Sent Click 2021-10-14 10:04:04 2021-10-14 10:57:28 false False Unspecified Unspecified Windows PC 3204.0
6 NL|2553835|130706|2021-10-14 HDOEI@TELENET.BE Sent Open 2021-10-14 10:04:04 2021-10-14 11:27:09 false False Unspecified Unspecified Unspecified Unspecified 4985.0 Sent Click 2021-10-14 10:04:04 2021-10-14 10:56:37 false True Unspecified Unspecified Windows PC 3153.0
7 NL|2553835|130706|2021-10-14 HDOEI@TELENET.BE Sent Open 2021-10-14 10:04:04 2021-10-14 11:27:09 false False Unspecified Unspecified Unspecified Unspecified 4985.0 Sent Click 2021-10-14 10:04:04 2021-10-14 10:57:28 false False Unspecified Unspecified Windows PC 3204.0
8 NL|2553835|130706|2021-10-14 HDOEI@TELENET.BE Sent Open 2021-10-14 10:04:04 2021-10-14 13:40:53 false False Unspecified Unspecified Unspecified Unspecified 13009.0 Sent Click 2021-10-14 10:04:04 2021-10-14 10:56:37 false True Unspecified Unspecified Windows PC 3153.0
9 NL|2553835|130706|2021-10-14 HDOEI@TELENET.BE Sent Open 2021-10-14 10:04:04 2021-10-14 13:40:53 false False Unspecified Unspecified Unspecified Unspecified 13009.0 Sent Click 2021-10-14 10:04:04 2021-10-14 10:57:28 false False Unspecified Unspecified Windows PC 3204.0
10 NL|2354993|114148|2021-07-20 Lammy.elving@radboudumc.nl Sent Open 2021-07-20 10:03:50 2021-07-20 12:02:09 false True Safari iPhone iOS iPhone 7099.0 Sent Click 2021-07-20 10:03:50 2021-07-20 12:04:15 false True Unspecified Unspecified Unspecified Unspecified 7225.0
11 NL|2553835|88990|2021-10-14 Marleen.Vosjan@wza.nl Sent Open 2021-10-14 10:04:04 2021-10-14 14:07:28 false True Unspecified Unspecified Windows Unspecified 14604.0 Sent Click 2021-10-14 10:04:04 2021-10-14 14:07:34 false True Unspecified Unspecified Windows PC 14610.0
12 NL|2540669|109737|2021-10-10 OPVOS@NEDERFOX.COM Sent Open 2021-10-10 10:02:55 2021-10-10 11:37:34 false True Android Android Android Unspecified 5679.0 Sent Click 2021-10-10 10:02:55 2021-10-10 11:29:31 false True Android Android Android Unspecified 5196.0
13 NL|2585828|103847|2021-10-28 a.becker@vumc.nl Sent Open 2021-10-28 10:03:44 2021-10-28 11:15:34 false True Unspecified Unspecified Windows PC 4310.0 Sent Click 2021-10-28 10:03:44 2021-10-28 11:15:46 false True Unspecified Unspecified Unspecified Unspecified 4322.0
14 NL|2585828|103847|2021-10-28 a.becker@vumc.nl Sent Open 2021-10-28 10:03:44 2021-10-28 11:15:34 false True Unspecified Unspecified Windows PC 4310.0 Sent Click 2021-10-28 10:03:44 2021-10-28 11:15:58 false False Unspecified Unspecified Windows PC 4334.0
15 NL|2585828|103847|2021-10-28 a.becker@vumc.nl Sent Open 2021-10-28 10:03:44 2021-10-28 11:15:34 false True Unspecified Unspecified Windows PC 4310.0 Sent Click 2021-10-28 10:03:44 2021-10-28 11:15:58 false False Unspecified Unspecified Windows PC 4334.0
16 NL|2553835|132473|2021-10-14 a.beijnen@plesmanapotheek.nl Sent Open 2021-10-14 10:04:00 2021-10-14 10:17:15 false True Unspecified Unspecified Windows PC 795.0 Sent Click 2021-10-14 10:04:00 2021-10-14 10:17:15 false False Unspecified Unspecified Windows PC 795.0
17 NL|2553835|132473|2021-10-14 a.beijnen@plesmanapotheek.nl Sent Open 2021-10-14 10:04:00 2021-10-14 10:17:15 false True Unspecified Unspecified Windows PC 795.0 Sent Click 2021-10-14 10:04:00 2021-10-14 10:17:22 false True Unspecified Unspecified Windows PC 802.0
18 NL|2585836|107341|2021-10-31 a.braam@rivas.nl Sent Open 2021-10-31 10:02:53 2021-10-31 13:53:26 false True Unspecified Unspecified Unspecified Unspecified 13833.0 Sent Click 2021-10-31 10:02:53 2021-10-31 13:53:42 false True Safari Apple Mail 1 Mac OS X 10.1 Macintosh 13849.0
19 NL|2585836|107341|2021-10-31 a.braam@rivas.nl Sent Open 2021-10-31 10:02:53 2021-10-31 13:53:26 false True Unspecified Unspecified Unspecified Unspecified 13833.0 Sent Click 2021-10-31 10:02:53 2021-10-31 13:53:45 false False Safari Apple Mail 1 Mac OS X 10.1 Macintosh 13852.0
20 NL|2585836|107341|2021-10-31 a.braam@rivas.nl Sent Open 2021-10-31 10:02:53 2021-10-31 13:53:26 false True Unspecified Unspecified Unspecified Unspecified 13833.0 Sent Click 2021-10-31 10:02:53 2021-10-31 13:53:46 false False Unspecified Unspecified Windows PC 13853.0
21 NL|2585836|107341|2021-10-31 a.braam@rivas.nl Sent Open 2021-10-31 10:02:53 2021-10-31 13:53:26 false True Unspecified Unspecified Unspecified Unspecified 13833.0 Sent Click 2021-10-31 10:02:53 2021-10-31 13:54:14 false False Firefox 10 Unspecified Windows PC 13881.0
22 NL|2141027|125232|2021-04-15 a.coumou@etz.nl Sent Open 2021-04-15 16:01:02 2021-04-15 16:58:29 false True Unspecified Apple Mail 1 Mac OS X 10.1 Macintosh 3447.0 Sent Click 2021-04-15 16:01:02 2021-04-15 16:01:06 false True Chrome Unspecified Windows 7 PC 4.0
23 NL|2315451|125232|2021-07-01 a.coumou@etz.nl Sent Open 2021-07-01 10:00:50 2021-07-01 10:00:56 Bot_open_model True Chrome Unspecified Windows 7 PC 6.0 Sent Click 2021-07-01 10:00:50 2021-07-01 10:00:56 false True Chrome Unspecified Windows 7 PC 6.0
24 NL|2333171|125232|2021-07-11 a.coumou@etz.nl Sent Open 2021-07-11 10:00:40 2021-07-11 10:00:44 Bot_open_model True Chrome Unspecified Windows 7 PC 4.0 Sent Click 2021-07-11 10:00:40 2021-07-11 10:00:44 false True Chrome Unspecified Windows 7 PC 4.0
25 NL|2333171|125232|2021-07-11 a.coumou@etz.nl Sent Open 2021-07-11 10:00:40 2021-07-11 10:00:44 Bot_open_model True Chrome Unspecified Windows 7 PC 4.0 Sent Click 2021-07-11 10:00:40 2021-07-11 10:00:45 false False Chrome Unspecified Windows 7 PC 5.0
26 NL|2333171|125232|2021-07-11 a.coumou@etz.nl Sent Open 2021-07-11 10:00:40 2021-07-11 10:00:45 false False Chrome Unspecified Windows 7 PC 5.0 Sent Click 2021-07-11 10:00:40 2021-07-11 10:00:44 false True Chrome Unspecified Windows 7 PC 4.0
27 NL|2333171|125232|2021-07-11 a.coumou@etz.nl Sent Open 2021-07-11 10:00:40 2021-07-11 10:00:45 false False Chrome Unspecified Windows 7 PC 5.0 Sent Click 2021-07-11 10:00:40 2021-07-11 10:00:45 false False Chrome Unspecified Windows 7 PC 5.0
28 NL|2553835|109311|2021-10-14 a.debruijn20@telfort.nl Sent Open 2021-10-14 10:04:00 2021-10-14 10:27:07 false True Safari iPad iOS iPad 1387.0 Sent Click 2021-10-14 10:04:00 2021-10-14 10:27:39 false True Safari iPad iOS iPad 1419.0
29 NL|2553835|109311|2021-10-14 a.debruijn20@telfort.nl Sent Open 2021-10-14 10:04:00 2021-10-14 10:43:53 false False Safari iPad iOS iPad 2393.0 Sent Click 2021-10-14 10:04:00 2021-10-14 10:27:39 false True Safari iPad iOS iPad 1419.0
In [110]:
df_sub.columns
Out[110]:
Index(['SentId', 'ME Action Email', 'ME Event Type_FROM_SENT',
       'ME Event Type_FROM_CLICK_OR_OPEN', 'ME Event Date_FROM_SENT',
       'ME Event Date_FROM_CLICK_OR_OPEN', 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN',
       'ME_Is_Unique_FROM_CLICK_OR_OPEN', 'ME Browser_FROM_CLICK_OR_OPEN',
       'ME Email Client_FROM_CLICK_OR_OPEN',
       'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN',
       'Sent_to_OpenClick'],
      dtype='object')
In [ ]:
 
In [111]:
df_sub_sub.head
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-111-7d65f093d112> in <module>
----> 1 df_sub_sub.head

NameError: name 'df_sub_sub' is not defined
In [ ]:
 
In [ ]:
 
In [25]:
df_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
Out[25]:
false                        21037
Bot_click_model                929
Bot_open_model                 649
Bot_click_Honeypot             470
Bot_click_Honeypot_around      449
Name: BOT_ACTIVITY_FROM_CLICK_OR_OPEN, dtype: int64

Modeling

Pycaret Anomaly

  • iforest
In [23]:
### Just keep honeypot labels - O, 1 ###

df_sub_sub1 = df_sub[['BOT_ACTIVITY_FROM_CLICK_OR_OPEN',
       'ME_Is_Unique_FROM_CLICK_OR_OPEN', 'ME Browser_FROM_CLICK_OR_OPEN',
       'ME Email Client_FROM_CLICK_OR_OPEN',
       'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN'
       ]]#,'Sent_to_OpenClick', 'refrence_time_to_OpenClick'

df_sub_sub1 = df_sub_sub1.loc[df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].astype(str) != 'Bot_open_model']
df_sub_sub1 = df_sub_sub1.loc[df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].astype(str) != 'Bot_click_model']
df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('false', 0)
df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_Honeypot', 1)
df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_Honeypot_around', 1)
# df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_open_model',1)
# df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_model',1)


######### Pycaret ###########
#############################
# Importing module and initializing setup
from pycaret.anomaly import *
ano1 = setup(data = df_sub_sub1, preprocess=True, normalize= True)#, fraction = 0.05
# creating a model

iforest = create_model('sos')
# plotting a model
#plot_model(iforest, plot = 'tsne')#, feature = None, label = False, scale = 1, save False, display_format = None 
Description Value
0 session_id 4557
1 Original Data (21956, 6)
2 Missing Values False
3 Numeric Features 0
4 Categorical Features 6
5 Ordinal Features False
6 High Cardinality Features False
7 High Cardinality Method None
8 Transformed Data (21956, 42)
9 CPU Jobs -1
10 Use GPU False
11 Log Experiment False
12 Experiment Name anomaly-default-name
13 USI 028e
14 Imputation Type simple
15 Iterative Imputation Iteration None
16 Numeric Imputer mean
17 Iterative Imputation Numeric Model None
18 Categorical Imputer mode
19 Iterative Imputation Categorical Model None
20 Unknown Categoricals Handling least_frequent
21 Normalize True
22 Normalize Method zscore
23 Transformation False
24 Transformation Method None
25 PCA False
26 PCA Method None
27 PCA Components None
28 Ignore Low Variance False
29 Combine Rare Levels False
30 Rare Level Threshold None
31 Numeric Binning False
32 Remove Outliers False
33 Outliers Threshold None
34 Remove Multicollinearity False
35 Multicollinearity Threshold None
36 Remove Perfect Collinearity False
37 Clustering False
38 Clustering Iteration None
39 Polynomial Features False
40 Polynomial Degree None
41 Trignometry Features False
42 Polynomial Threshold None
43 Group Features False
44 Feature Selection False
45 Feature Selection Method classic
46 Features Selection Threshold None
47 Feature Interaction False
48 Feature Ratio False
49 Interaction Threshold None
Initiated . . . . . . . . . . . . . . . . . . 02:20:48
Status . . . . . . . . . . . . . . . . . . Compiling Final Models
Estimator . . . . . . . . . . . . . . . . . . Extra Trees Classifier
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-23-8e01ad6ce7c6> in <module>
     23 # creating a model
     24 
---> 25 iforest = create_model('sos')
     26 # plotting a model
     27 #plot_model(iforest, plot = 'tsne')#, feature = None, label = False, scale = 1, save False, display_format = None

~/anaconda3/envs/python3/lib/python3.6/site-packages/pycaret/anomaly.py in create_model(model, fraction, verbose, fit_kwargs, **kwargs)
    508         fit_kwargs=fit_kwargs,
    509         verbose=verbose,
--> 510         **kwargs,
    511     )
    512 

~/anaconda3/envs/python3/lib/python3.6/site-packages/pycaret/internal/tabular.py in create_model_unsupervised(estimator, num_clusters, fraction, ground_truth, round, fit_kwargs, verbose, system, add_to_model_list, raise_num_clusters, X_data, display, **kwargs)
   2677                         )
   2678             else:
-> 2679                 pipeline_with_model.fit(data_X, **fit_kwargs)
   2680         model_fit_end = time.time()
   2681 

~/anaconda3/envs/python3/lib/python3.6/site-packages/pycaret/internal/pipeline.py in fit(self, X, y, **fit_kwargs)
    116 
    117     def fit(self, X, y=None, **fit_kwargs):
--> 118         result = super().fit(X, y=y, **fit_kwargs)
    119 
    120         self._carry_over_final_estimator_fit_vars()

~/anaconda3/envs/python3/lib/python3.6/site-packages/imblearn/pipeline.py in fit(self, X, y, **fit_params)
    279                                  self._log_message(len(self.steps) - 1)):
    280             if self._final_estimator != "passthrough":
--> 281                 self._final_estimator.fit(Xt, yt, **fit_params)
    282         return self
    283 

~/anaconda3/envs/python3/lib/python3.6/site-packages/pyod/models/sos.py in fit(self, X, y)
    273         self._set_n_classes(y)
    274         D = self._x2d(X)
--> 275         A = self._d2a(D)
    276         B = self._a2b(A)
    277         O = self._b2o(B)

~/anaconda3/envs/python3/lib/python3.6/site-packages/pyod/models/sos.py in _d2a(self, D)
    210                         beta[i] = (beta[i] + betamin) / 2.0
    211                 # Recompute the values
--> 212                 (H, thisA) = _get_perplexity(Di, beta[i])
    213                 Hdiff = H - logU
    214                 tries += 1

KeyboardInterrupt: 
In [98]:
all_models = models()
all_models
Out[98]:
Name Reference
ID
abod Angle-base Outlier Detection pyod.models.abod.ABOD
cluster Clustering-Based Local Outlier pyod.models.cblof.CBLOF
cof Connectivity-Based Local Outlier pyod.models.cof.COF
iforest Isolation Forest pyod.models.iforest.IForest
histogram Histogram-based Outlier Detection pyod.models.hbos.HBOS
knn K-Nearest Neighbors Detector pyod.models.knn.KNN
lof Local Outlier Factor pyod.models.lof.LOF
svm One-class SVM detector pyod.models.ocsvm.OCSVM
pca Principal Component Analysis pyod.models.pca.PCA
mcd Minimum Covariance Determinant pyod.models.mcd.MCD
sod Subspace Outlier Detection pyod.models.sod.SOD
sos Stochastic Outlier Selection pyod.models.sos.SOS

Assign an anomaly label

In [ ]:
iforest_anomoly_label = assign_model(iforest, score = True, verbose= True) 
iforest_anomoly_label
In [ ]:
confusion_matrix = pd.crosstab(iforest_anomoly_label['Anomaly'], iforest_anomoly_label['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'], rownames=['Actual'], colnames=['Predicted'])
print (confusion_matrix)

One class SVM

In [ ]:
svm = create_model('svm')
# plotting a model
plot_model(svm, plot = 'tsne')
In [ ]:
evaluate_model(svm)
In [ ]:
evaluate_model(iforest)
In [ ]:
svm_anomoly_label = assign_model(svm, score = True, verbose= True) 
svm_anomoly_label
In [ ]:
iforest_predictions = predict_model(model = svm, data = df_sub_sub)
iforest_predictions
In [ ]:
confusion_matrix = pd.crosstab(svm_anomoly_label['Anomaly'], svm_anomoly_label['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'], rownames=['Actual'], colnames=['Predicted'])
print (confusion_matrix)

Skit-learn anaomaly

In [35]:
# !pip install -U scikit-learn
# !pip install --upgrade sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from IPython.display import Image as PImage
from subprocess import check_call
from PIL import Image, ImageDraw, ImageFont
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score


# define outlier detection model
model = OneClassSVM(gamma='scale', nu=0.01)
In [ ]:
## X and y defenition for train_test_split
df_sub_sub = df_sub[['BOT_ACTIVITY_FROM_CLICK_OR_OPEN',
       'ME_Is_Unique_FROM_CLICK_OR_OPEN', 'ME Browser_FROM_CLICK_OR_OPEN',
       'ME Email Client_FROM_CLICK_OR_OPEN',
       'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN',
       'Sent_to_OpenClick', 'refrence_time_to_OpenClick']]

# df_sub_sub = df_sub_sub.loc[df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].astype(str) != 'Bot_open_model']
# df_sub_sub = df_sub_sub.loc[df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].astype(str) != 'Bot_click_model']
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('false', 1)
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_Honeypot', -1)
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_Honeypot_around', -1)
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_open_model',-1)
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_model',-1)

df_sub_sub_x = df_sub_sub[['ME_Is_Unique_FROM_CLICK_OR_OPEN', 'ME Browser_FROM_CLICK_OR_OPEN',
       'ME Email Client_FROM_CLICK_OR_OPEN',
       'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN',
       'Sent_to_OpenClick']]

## One -hot -encoding
one_hot_data = pd.get_dummies(df_sub_sub_x[['ME_Is_Unique_FROM_CLICK_OR_OPEN', 'ME Browser_FROM_CLICK_OR_OPEN',
       'ME Email Client_FROM_CLICK_OR_OPEN',
       'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN']], drop_first=True, dtype=float)#

# df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('false',1)
# df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_open_model',-1)
# df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_model',-1)
# df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_Honeypot',-1)
# df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_Honeypot_around',-1)

X, y = one_hot_data, df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1)
# # fit on majority class
X_train = X_train[y_train==1]
model.fit(X_train)

# ...
# detect outliers in the test set
yhat = model.predict(X_test)


...
# # mark inliers 1, outliers -1
y_test[y_test == 1] = 1
y_test[y_test == -1] = -1


y_test=y_test.to_numpy()
y_test=y_test.astype(str).astype(int)

# calculate score
score = f1_score(y_test, yhat, pos_label=-1)
print('F1 Score: %.3f' % score)
In [ ]:
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
In [ ]:
y_test
In [ ]:
yhat
In [ ]:
yhat.shape
#_test.shape

Isolation

In [40]:
# isolation forest for imbalanced classification
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import IsolationForest

# define outlier detection model
model = IsolationForest(contamination=0.4, behaviour='new')
# fit on majority class
model.fit(X_train)

# ...
# detect outliers in the test set
yhat = model.predict(X_test)


...
# # mark inliers 1, outliers -1
y_test[y_test == '1'] = 1
y_test[y_test == '-1'] = -1


# y_test=y_test.to_numpy()
# y_test=y_test.astype(str).astype(int)

# calculate score
score = f1_score(y_test, yhat, pos_label=-1)
print('F1 Score: %.3f' % score)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-40-15a732706845> in <module>
      8 model = IsolationForest(contamination=0.4, behaviour='new')
      9 # fit on majority class
---> 10 model.fit(X_train)
     11 
     12 # ...

NameError: name 'X_train' is not defined
In [ ]:
 
In [ ]:
 
In [ ]:
 

Pycaret Classification

In [271]:
# df_sub_sub = df_sub_sub.loc[df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].astype(str) != 'Bot_open_model']
# df_sub_sub = df_sub_sub.loc[df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].astype(str) != 'Bot_click_model']
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('false', 0)
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_Honeypot',1)
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_Honeypot_around',1)
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_open_model',1)
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_model',1)
In [272]:
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
Out[272]:
-1    21037
 1      919
Name: BOT_ACTIVITY_FROM_CLICK_OR_OPEN, dtype: int64
In [273]:
import pycaret
from pycaret.classification import *
exp_clf101 = setup(data = df_sub_sub,
                    ignore_low_variance= True , combine_rare_levels= True,
                   remove_multicollinearity= True,  use_gpu= True, target = 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN' , session_id=123) #, profile= True, high_cardinality_features=['emailclient', 'operatingsystem'], transformation= True,feature_selection= True,
Initiated . . . . . . . . . . . . . . . . . . 00:10:47
Status . . . . . . . . . . . . . . . . . . Fitting 10 Folds
Estimator . . . . . . . . . . . . . . . . . . Gradient Boosting Classifier
Data Type
SentId Categorical
ME Action Email Categorical
ME Event Type_FROM_CLICK_OR_OPEN Categorical
ME_Is_Unique_FROM_CLICK_OR_OPEN Categorical
ME Browser_FROM_CLICK_OR_OPEN Categorical
ME Email Client_FROM_CLICK_OR_OPEN Categorical
ME OperatingSystem_FROM_CLICK_OR_OPEN Categorical
ME Device_FROM_CLICK_OR_OPEN Categorical
Sent_Time Date
Open_Time Date
Click_Time Date
BOT_ACTIVITY_FROM_CLICK_OR_OPEN Label
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-273-e8666617c90f> in <module>
      3 exp_clf101 = setup(data = df_sub_sub,
      4                     ignore_low_variance= True , combine_rare_levels= True,
----> 5                    remove_multicollinearity= True,  use_gpu= True, target = 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN' , session_id=123) #, profile= True, high_cardinality_features=['emailclient', 'operatingsystem'], transformation= True,feature_selection= True,

~/anaconda3/envs/python3/lib/python3.6/site-packages/pycaret/classification.py in setup(data, target, train_size, test_data, preprocess, imputation_type, iterative_imputation_iters, categorical_features, categorical_imputation, categorical_iterative_imputer, ordinal_features, high_cardinality_features, high_cardinality_method, numeric_features, numeric_imputation, numeric_iterative_imputer, date_features, ignore_features, normalize, normalize_method, transformation, transformation_method, handle_unknown_categorical, unknown_categorical_method, pca, pca_method, pca_components, ignore_low_variance, combine_rare_levels, rare_level_threshold, bin_numeric_features, remove_outliers, outliers_threshold, remove_multicollinearity, multicollinearity_threshold, remove_perfect_collinearity, create_clusters, cluster_iter, polynomial_features, polynomial_degree, trigonometry_features, polynomial_threshold, group_features, group_names, feature_selection, feature_selection_threshold, feature_selection_method, feature_interaction, feature_ratio, interaction_threshold, fix_imbalance, fix_imbalance_method, data_split_shuffle, data_split_stratify, fold_strategy, fold, fold_shuffle, fold_groups, n_jobs, use_gpu, custom_pipeline, html, session_id, log_experiment, experiment_name, log_plots, log_profile, log_data, silent, verbose, profile, profile_kwargs)
    653         verbose=verbose,
    654         profile=profile,
--> 655         profile_kwargs=profile_kwargs,
    656     )
    657 

~/anaconda3/envs/python3/lib/python3.6/site-packages/pycaret/internal/tabular.py in setup(data, target, ml_usecase, available_plots, train_size, test_data, preprocess, imputation_type, iterative_imputation_iters, categorical_features, categorical_imputation, categorical_iterative_imputer, ordinal_features, high_cardinality_features, high_cardinality_method, numeric_features, numeric_imputation, numeric_iterative_imputer, date_features, ignore_features, normalize, normalize_method, transformation, transformation_method, handle_unknown_categorical, unknown_categorical_method, pca, pca_method, pca_components, ignore_low_variance, combine_rare_levels, rare_level_threshold, bin_numeric_features, remove_outliers, outliers_threshold, remove_multicollinearity, multicollinearity_threshold, remove_perfect_collinearity, create_clusters, cluster_iter, polynomial_features, polynomial_degree, trigonometry_features, polynomial_threshold, group_features, group_names, feature_selection, feature_selection_threshold, feature_selection_method, feature_interaction, feature_ratio, interaction_threshold, fix_imbalance, fix_imbalance_method, transform_target, transform_target_method, data_split_shuffle, data_split_stratify, fold_strategy, fold, fold_shuffle, fold_groups, n_jobs, use_gpu, custom_pipeline, html, session_id, log_experiment, experiment_name, log_plots, log_profile, log_data, silent, verbose, profile, profile_kwargs, display)
   1328             test_data = pd.concat([X_test, y_test], axis=1)
   1329 
-> 1330         train_data = prep_pipe.fit_transform(train_data)
   1331         # workaround to also transform target
   1332         dtypes.final_training_columns.append(target)

~/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    365         """
    366         fit_params_steps = self._check_fit_params(**fit_params)
--> 367         Xt = self._fit(X, y, **fit_params_steps)
    368 
    369         last_step = self._final_estimator

~/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
    294                 message_clsname='Pipeline',
    295                 message=self._log_message(step_idx),
--> 296                 **fit_params_steps[name])
    297             # Replace the transformer of the step with the fitted
    298             # transformer. This is necessary when loading the transformer

~/anaconda3/envs/python3/lib/python3.6/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    350 
    351     def __call__(self, *args, **kwargs):
--> 352         return self.func(*args, **kwargs)
    353 
    354     def call_and_shelve(self, *args, **kwargs):

~/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    738     with _print_elapsed_time(message_clsname, message):
    739         if hasattr(transformer, 'fit_transform'):
--> 740             res = transformer.fit_transform(X, y, **fit_params)
    741         else:
    742             res = transformer.fit(X, y, **fit_params).transform(X)

~/anaconda3/envs/python3/lib/python3.6/site-packages/pycaret/internal/preprocess.py in fit_transform(self, dataset, y)
    419 
    420         # since this is for training , we dont nees any transformation since it has already been transformed in fit
--> 421         data = self.fit(data)
    422 
    423         # additionally we just need to treat the target variable

~/anaconda3/envs/python3/lib/python3.6/site-packages/pycaret/internal/preprocess.py in fit(self, dataset, y)
    321 
    322             display(dt_print_out[["Data Type"]])
--> 323             self.response = input()
    324 
    325             if self.response in [

~/anaconda3/envs/python3/lib/python3.6/site-packages/ipykernel/kernelbase.py in raw_input(self, prompt)
    849             self._parent_ident,
    850             self._parent_header,
--> 851             password=False,
    852         )
    853 

~/anaconda3/envs/python3/lib/python3.6/site-packages/ipykernel/kernelbase.py in _input_request(self, prompt, ident, parent, password)
    890             except KeyboardInterrupt:
    891                 # re-raise KeyboardInterrupt, to truncate traceback
--> 892                 raise KeyboardInterrupt("Interrupted by user") from None
    893             except Exception as e:
    894                 self.log.warning("Invalid Message:", exc_info=True)

KeyboardInterrupt: Interrupted by user
In [115]:
### Train
#(TIP : It's always good to look at the standard deviation of CV results when using create_model()
best_model = compare_models()
Model Accuracy AUC Recall Prec. F1 Kappa MCC TT (Sec)
ada Ada Boost Classifier 0.9887 0.9967 0.9904 0.9979 0.9941 0.8485 0.8528 0.2610
gbc Gradient Boosting Classifier 0.9877 0.9965 0.9900 0.9973 0.9936 0.8351 0.8391 0.5680
knn K Neighbors Classifier 0.9871 0.9961 0.9909 0.9958 0.9933 0.8214 0.8239 0.5850
dt Decision Tree Classifier 0.9870 0.9866 0.9904 0.9962 0.9933 0.8219 0.8245 0.0150
lightgbm Light Gradient Boosting Machine 0.9870 0.9962 0.9909 0.9957 0.9933 0.8195 0.8216 0.0860
rf Random Forest Classifier 0.9867 0.9965 0.9910 0.9952 0.9931 0.8130 0.8146 0.9200
et Extra Trees Classifier 0.9864 0.9914 0.9900 0.9959 0.9929 0.8137 0.8165 0.9030
lr Logistic Regression 0.9862 0.9958 0.9909 0.9948 0.9928 0.8055 0.8069 0.1780
svm SVM - Linear Kernel 0.9829 0.0000 0.9877 0.9945 0.9911 0.7674 0.7715 0.3660
nb Naive Bayes 0.9827 0.9927 0.9821 1.0000 0.9910 0.7958 0.8134 0.0150
ridge Ridge Classifier 0.9770 0.0000 0.9804 0.9958 0.9880 0.7147 0.7303 0.0240
lda Linear Discriminant Analysis 0.9705 0.9897 0.9694 1.0000 0.9845 0.6914 0.7271 0.0740
qda Quadratic Discriminant Analysis 0.9649 0.5000 1.0000 0.9649 0.9821 0.0000 0.0000 0.0330
In [ ]:
 
In [ ]:
 
In [116]:
df_sub.head()
Out[116]:
SentId ME Action Email ME Event Type_FROM_SENT ME Event Type_FROM_CLICK_OR_OPEN ME Event Date_FROM_SENT ME Event Date_FROM_CLICK_OR_OPEN BOT_ACTIVITY_FROM_CLICK_OR_OPEN ME_Is_Unique_FROM_CLICK_OR_OPEN ME Browser_FROM_CLICK_OR_OPEN ME Email Client_FROM_CLICK_OR_OPEN ME OperatingSystem_FROM_CLICK_OR_OPEN ME Device_FROM_CLICK_OR_OPEN Sent_to_OpenClick
0 NL|2553841|134552|2021-10-17 81@xs4all.nl Sent Open 2021-10-17 10:03:52 2021-10-17 10:07:54 false True Safari iPhone iOS iPhone 242.0
1 NL|2585825|109216|2021-10-28 Apotheekdehamershof@xs4all.nl Sent Open 2021-10-28 10:03:45 2021-10-28 11:13:59 false True Unspecified Unspecified Unspecified Unspecified 4214.0
2 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Sent Click 2021-11-09 10:02:16 2021-11-09 10:51:24 false True Android Android Android Unspecified 2948.0
3 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Sent Click 2021-11-09 10:02:16 2021-11-09 10:51:57 false False Android Android Android Unspecified 2981.0
4 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Sent Open 2021-11-09 10:02:16 2021-11-09 10:50:17 false True Firefox Unspecified Windows XP PC 2881.0
In [ ]:
df_sub['refrence_time']= '2021-01-01 00:00:00'
df_sub['ME Event Date_FROM_CLICK_OR_OPEN'] = pd.to_datetime(df_sub['ME Event Date_FROM_CLICK_OR_OPEN'])
df_sub['refrence_time'] = pd.to_datetime(df_sub['refrence_time'])
df_sub['refrence_time_to_OpenClick']= df_sub['ME Event Date_FROM_CLICK_OR_OPEN'] - df_sub['refrence_time']
df_sub['refrence_time_to_OpenClick']= df_sub['refrence_time_to_OpenClick']/np.timedelta64(1,'s')
In [ ]:
df_sub.head()
In [ ]:
df_sub_sorted = df_sub.sort_values(by=['ME Action Email', 'SentId', 'ME Event Date_FROM_CLICK_OR_OPEN'], axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last', ignore_index=False, key=None)
In [ ]:
df_sub_sorted.head()
In [ ]:
 
In [ ]:
 

New join with three new time features

In [208]:
df_3 = pd.read_csv('add_sent_open_click_time_23534_40.csv')
print(df_3.shape)
df_3.isnull().sum()
(23534, 41)
Out[208]:
Unnamed: 0                                      0
SentId                                          0
ME Action Email                                 0
ME Event Type_FROM_SENT                         0
ME Event Type_FROM_CLICK_OR_OPEN                0
ME Event Date_FROM_SENT                         0
ME Event Date_FROM_CLICK_OR_OPEN                0
BOT_ACTIVITY_FROM_SENT                          0
BOT_ACTIVITY_FROM_CLICK_OR_OPEN                 0
Triggered Email Key_FROM_SENT               12408
%Country Mailing Key_FROM_CLICK_OR_OPEN         0
%Email Activity Key_FROM_CLICK_OR_OPEN          0
%User Activity Key_FROM_CLICK_OR_OPEN           0
%Mailing Key_FROM_CLICK_OR_OPEN                 0
%MDM Key_FROM_CLICK_OR_OPEN                     8
%Calendar Key_FROM_CLICK_OR_OPEN                0
%Journey Activity Key_FROM_CLICK_OR_OPEN        0
Subscriber_List ID_FROM_CLICK_OR_OPEN           0
ME Send URL_FROM_CLICK_OR_OPEN              17353
ME URL ID_FROM_CLICK_OR_OPEN                17353
ME Click URL_FROM_CLICK_OR_OPEN             17353
ME Alias_FROM_CLICK_OR_OPEN                 17353
Triggered Email Key_FROM_CLICK_OR_OPEN      12408
ME_Is_Unique_FROM_CLICK_OR_OPEN                 0
ME_Is_Unique_for_URL_FROM_CLICK_OR_OPEN     17353
ME Browser_FROM_CLICK_OR_OPEN                   0
ME Email Client_FROM_CLICK_OR_OPEN              0
ME OperatingSystem_FROM_CLICK_OR_OPEN           0
ME Device_FROM_CLICK_OR_OPEN                    0
%Country Mailing Key_FROM_SENT                  0
%Email Activity Key_FROM_SENT                   0
%User Activity Key_FROM_SENT                    0
%Mailing Key_FROM_SENT                          0
%MDM Key_FROM_SENT                              8
%Calendar Key_FROM_SENT                         0
%Campaign Key_FROM_SENT                      3332
%Journey Activity Key_FROM_SENT                 0
Subscriber_List ID_FROM_SENT                    0
Sent_Time                                       0
Open_Time                                       0
Click_Time                                  17353
dtype: int64
In [209]:
df_3.columns
Out[209]:
Index(['Unnamed: 0', 'SentId', 'ME Action Email', 'ME Event Type_FROM_SENT',
       'ME Event Type_FROM_CLICK_OR_OPEN', 'ME Event Date_FROM_SENT',
       'ME Event Date_FROM_CLICK_OR_OPEN', 'BOT_ACTIVITY_FROM_SENT',
       'BOT_ACTIVITY_FROM_CLICK_OR_OPEN', 'Triggered Email Key_FROM_SENT',
       '%Country Mailing Key_FROM_CLICK_OR_OPEN',
       '%Email Activity Key_FROM_CLICK_OR_OPEN',
       '%User Activity Key_FROM_CLICK_OR_OPEN',
       '%Mailing Key_FROM_CLICK_OR_OPEN', '%MDM Key_FROM_CLICK_OR_OPEN',
       '%Calendar Key_FROM_CLICK_OR_OPEN',
       '%Journey Activity Key_FROM_CLICK_OR_OPEN',
       'Subscriber_List ID_FROM_CLICK_OR_OPEN',
       'ME Send URL_FROM_CLICK_OR_OPEN', 'ME URL ID_FROM_CLICK_OR_OPEN',
       'ME Click URL_FROM_CLICK_OR_OPEN', 'ME Alias_FROM_CLICK_OR_OPEN',
       'Triggered Email Key_FROM_CLICK_OR_OPEN',
       'ME_Is_Unique_FROM_CLICK_OR_OPEN',
       'ME_Is_Unique_for_URL_FROM_CLICK_OR_OPEN',
       'ME Browser_FROM_CLICK_OR_OPEN', 'ME Email Client_FROM_CLICK_OR_OPEN',
       'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN',
       '%Country Mailing Key_FROM_SENT', '%Email Activity Key_FROM_SENT',
       '%User Activity Key_FROM_SENT', '%Mailing Key_FROM_SENT',
       '%MDM Key_FROM_SENT', '%Calendar Key_FROM_SENT',
       '%Campaign Key_FROM_SENT', '%Journey Activity Key_FROM_SENT',
       'Subscriber_List ID_FROM_SENT', 'Sent_Time', 'Open_Time', 'Click_Time'],
      dtype='object')
In [210]:
df_3_sub['ME Device_FROM_CLICK_OR_OPEN'].value_counts()
Out[210]:
PC             9203
Unspecified    7840
iPhone         4448
Macintosh      1899
iPad            144
Name: ME Device_FROM_CLICK_OR_OPEN, dtype: int64
In [211]:
df_3_sub = df_3[['SentId','ME Action Email', 'ME Event Type_FROM_CLICK_OR_OPEN',
       'BOT_ACTIVITY_FROM_CLICK_OR_OPEN',
       'ME_Is_Unique_FROM_CLICK_OR_OPEN', 
       'ME Browser_FROM_CLICK_OR_OPEN', 'ME Email Client_FROM_CLICK_OR_OPEN',
       'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN', 'Sent_Time', 'Open_Time', 'Click_Time']]
df_3_sub.head(30)
Out[211]:
SentId ME Action Email ME Event Type_FROM_CLICK_OR_OPEN BOT_ACTIVITY_FROM_CLICK_OR_OPEN ME_Is_Unique_FROM_CLICK_OR_OPEN ME Browser_FROM_CLICK_OR_OPEN ME Email Client_FROM_CLICK_OR_OPEN ME OperatingSystem_FROM_CLICK_OR_OPEN ME Device_FROM_CLICK_OR_OPEN Sent_Time Open_Time Click_Time
0 NL|2553841|134552|2021-10-17 81@xs4all.nl Open false True Safari iPhone iOS iPhone 2021-10-17 10:03:52 2021-10-17 10:07:54 NaN
1 NL|2585825|109216|2021-10-28 Apotheekdehamershof@xs4all.nl Open false True Unspecified Unspecified Unspecified Unspecified 2021-10-28 10:03:45 2021-10-28 11:13:59 NaN
2 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Open false True Firefox Unspecified Windows XP PC 2021-11-09 10:02:16 2021-11-09 10:50:17 NaN
3 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Open false False Firefox Unspecified Windows XP PC 2021-11-09 10:02:16 2021-11-09 16:26:59 NaN
4 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Click false True Android Android Android Unspecified 2021-11-09 10:02:16 2021-11-09 16:26:59 2021-11-09 10:51:24
5 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Click false False Android Android Android Unspecified 2021-11-09 10:02:16 2021-11-09 16:26:59 2021-11-09 10:51:57
6 NL|2553837|130706|2021-10-17 HDOEI@TELENET.BE Open false True Safari iPhone iOS iPhone 2021-10-17 10:03:43 2021-10-17 20:34:23 NaN
7 NL|2553835|130706|2021-10-14 HDOEI@TELENET.BE Open false True Unspecified Unspecified Unspecified Unspecified 2021-10-14 10:04:04 2021-10-14 10:56:22 NaN
8 NL|2553835|130706|2021-10-14 HDOEI@TELENET.BE Open false False Unspecified Unspecified Unspecified Unspecified 2021-10-14 10:04:04 2021-10-14 11:27:09 NaN
9 NL|2553835|130706|2021-10-14 HDOEI@TELENET.BE Open false False Unspecified Unspecified Unspecified Unspecified 2021-10-14 10:04:04 2021-10-14 13:40:53 NaN
10 NL|2553835|130706|2021-10-14 HDOEI@TELENET.BE Click false True Unspecified Unspecified Windows PC 2021-10-14 10:04:04 2021-10-14 13:40:53 2021-10-14 10:56:37
11 NL|2553835|130706|2021-10-14 HDOEI@TELENET.BE Click false False Unspecified Unspecified Windows PC 2021-10-14 10:04:04 2021-10-14 13:40:53 2021-10-14 10:57:28
12 NL|2354993|114148|2021-07-20 Lammy.elving@radboudumc.nl Open false True Safari iPhone iOS iPhone 2021-07-20 10:03:50 2021-07-20 12:02:09 NaN
13 NL|2354993|114148|2021-07-20 Lammy.elving@radboudumc.nl Click false True Unspecified Unspecified Unspecified Unspecified 2021-07-20 10:03:50 2021-07-20 12:02:09 2021-07-20 12:04:15
14 NL|2553835|88990|2021-10-14 Marleen.Vosjan@wza.nl Click false True Unspecified Unspecified Windows PC 2021-10-14 10:04:04 2021-10-14 14:07:28 2021-10-14 14:07:34
15 NL|2553835|88990|2021-10-14 Marleen.Vosjan@wza.nl Open false True Unspecified Unspecified Windows Unspecified 2021-10-14 10:04:04 2021-10-14 14:07:28 NaN
16 NL|2540669|109737|2021-10-10 OPVOS@NEDERFOX.COM Click false True Android Android Android Unspecified 2021-10-10 10:02:55 2021-10-10 11:37:34 2021-10-10 11:29:31
17 NL|2540669|109737|2021-10-10 OPVOS@NEDERFOX.COM Open false True Android Android Android Unspecified 2021-10-10 10:02:55 2021-10-10 11:37:34 NaN
18 NL|2536355|102563|2021-10-07 a.ali@apotheekmonster.nl Open false True Unspecified Thunderbird Windows PC 2021-10-07 10:02:04 2021-10-07 10:15:42 NaN
19 NL|2105290|102563|2021-03-31 a.ali@apotheekmonster.nl Open false True Unspecified Thunderbird Windows PC 2021-03-31 10:00:59 2021-03-31 12:11:21 NaN
20 NL|2536370|102563|2021-10-13 a.ali@apotheekmonster.nl Open false True Unspecified Thunderbird Windows PC 2021-10-13 10:02:54 2021-10-13 10:05:27 NaN
21 NL|2585825|102563|2021-10-28 a.ali@apotheekmonster.nl Open false True Unspecified Thunderbird Windows PC 2021-10-28 10:03:46 2021-10-28 10:05:39 NaN
22 NL|2354993|102563|2021-07-20 a.ali@apotheekmonster.nl Open false False Unspecified Apple Mail 1 Mac OS X 10.1 Macintosh 2021-07-20 10:04:08 2021-07-20 13:28:45 NaN
23 NL|2354993|102563|2021-07-20 a.ali@apotheekmonster.nl Open false False Unspecified Apple Mail 1 Mac OS X 10.1 Macintosh 2021-07-20 10:04:08 2021-07-20 13:26:09 NaN
24 NL|2354993|102563|2021-07-20 a.ali@apotheekmonster.nl Open false True Unspecified Apple Mail 1 Mac OS X 10.1 Macintosh 2021-07-20 10:04:08 2021-07-20 13:26:08 NaN
25 NL|2354993|140133|2021-07-20 a.altoma@antoniusziekenhuis.nl Open false False Safari iPhone iOS iPhone 2021-07-20 10:04:04 2021-07-20 17:22:53 NaN
26 NL|2354993|140133|2021-07-20 a.altoma@antoniusziekenhuis.nl Open false True Safari iPhone iOS iPhone 2021-07-20 10:04:04 2021-07-20 12:38:39 NaN
27 NL|2123412|115520|2021-04-08 a.b.francken@isala.nl Open false True Unspecified Unspecified Windows PC 2021-04-08 16:01:07 2021-04-08 21:48:41 NaN
28 NL|2108171|103847|2021-04-05 a.becker@vumc.nl Open false True Safari iPhone iOS iPhone 2021-04-05 13:00:38 2021-04-05 13:12:29 NaN
29 NL|2555383|103847|2021-10-14 a.becker@vumc.nl Open false True Safari iPhone iOS iPhone 2021-10-14 10:00:51 2021-10-14 14:03:47 NaN

Suman Leackage investigation

In [212]:
df_3_sub_sorted = df_3_sub.sort_values(by=['ME Action Email', 'SentId', 'Sent_Time', 'Open_Time'], axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last', ignore_index=False, key=None)
In [213]:
df_3_sub_sorted.head()
Out[213]:
SentId ME Action Email ME Event Type_FROM_CLICK_OR_OPEN BOT_ACTIVITY_FROM_CLICK_OR_OPEN ME_Is_Unique_FROM_CLICK_OR_OPEN ME Browser_FROM_CLICK_OR_OPEN ME Email Client_FROM_CLICK_OR_OPEN ME OperatingSystem_FROM_CLICK_OR_OPEN ME Device_FROM_CLICK_OR_OPEN Sent_Time Open_Time Click_Time
0 NL|2553841|134552|2021-10-17 81@xs4all.nl Open false True Safari iPhone iOS iPhone 2021-10-17 10:03:52 2021-10-17 10:07:54 NaN
1 NL|2585825|109216|2021-10-28 Apotheekdehamershof@xs4all.nl Open false True Unspecified Unspecified Unspecified Unspecified 2021-10-28 10:03:45 2021-10-28 11:13:59 NaN
2 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Open false True Firefox Unspecified Windows XP PC 2021-11-09 10:02:16 2021-11-09 10:50:17 NaN
3 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Open false False Firefox Unspecified Windows XP PC 2021-11-09 10:02:16 2021-11-09 16:26:59 NaN
4 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Click false True Android Android Android Unspecified 2021-11-09 10:02:16 2021-11-09 16:26:59 2021-11-09 10:51:24
In [214]:
df_3_sub_sorted.shape
Out[214]:
(23534, 12)
In [215]:
######### Seperate Click and Open ########
df_3_sub_sorted_Open = df_3_sub_sorted.loc[df_3_sub_sorted['ME Event Type_FROM_CLICK_OR_OPEN'].astype(str) == 'Open']
df_3_sub_sorted_Click = df_3_sub_sorted.loc[df_3_sub_sorted['ME Event Type_FROM_CLICK_OR_OPEN'].astype(str) == 'Click']
In [216]:
# Change values to datetime, create new columns for dif and change to seconds
df_3_sub_sorted_Open['Sent_Time'] = pd.to_datetime(df_3_sub_sorted_Open['Sent_Time']) 
df_3_sub_sorted_Open['Open_Time'] = pd.to_datetime(df_3_sub_sorted_Open['Open_Time'])
df_3_sub_sorted_Open['Click_Time'] = pd.to_datetime(df_3_sub_sorted_Open['Click_Time'])

df_3_sub_sorted_Open['Sent_to_Open'] = df_3_sub_sorted_Open['Open_Time'] - df_3_sub_sorted_Open['Sent_Time']
df_3_sub_sorted_Open['Sent_to_Open'] = df_3_sub_sorted_Open['Sent_to_Open']/np.timedelta64(1,'s')

df_3_sub_sorted_Open['Sent_to_Click'] = df_3_sub_sorted_Open['Click_Time'] - df_3_sub_sorted_Open['Sent_Time']
df_3_sub_sorted_Open['Sent_to_Click'] = df_3_sub_sorted_Open['Sent_to_Click']/np.timedelta64(1,'s')

df_3_sub_sorted_Open['Open_to_Click'] = df_3_sub_sorted_Open['Click_Time'] - df_3_sub_sorted_Open['Open_Time']
df_3_sub_sorted_Open['Open_to_Click'] = df_3_sub_sorted_Open['Open_to_Click']/np.timedelta64(1,'s')
In [217]:
# Change values to datetime, create new columns for dif and change to seconds
df_3_sub_sorted_Click['Sent_Time'] = pd.to_datetime(df_3_sub_sorted_Click['Sent_Time']) 
df_3_sub_sorted_Click['Open_Time'] = pd.to_datetime(df_3_sub_sorted_Click['Open_Time'])
df_3_sub_sorted_Click['Click_Time'] = pd.to_datetime(df_3_sub_sorted_Click['Click_Time'])

df_3_sub_sorted_Click['Sent_to_Open'] = df_3_sub_sorted_Click['Open_Time'] - df_3_sub_sorted_Click['Sent_Time']
df_3_sub_sorted_Click['Sent_to_Open'] = df_3_sub_sorted_Click['Sent_to_Open']/np.timedelta64(1,'s')

df_3_sub_sorted_Click['Sent_to_Click'] = df_3_sub_sorted_Click['Click_Time'] - df_3_sub_sorted_Click['Sent_Time']
df_3_sub_sorted_Click['Sent_to_Click'] = df_3_sub_sorted_Click['Sent_to_Click']/np.timedelta64(1,'s')

df_3_sub_sorted_Click['Open_to_Click'] = df_3_sub_sorted_Click['Click_Time'] - df_3_sub_sorted_Click['Open_Time']
df_3_sub_sorted_Click['Open_to_Click'] = df_3_sub_sorted_Click['Open_to_Click']/np.timedelta64(1,'s')
In [218]:
# df_3_sub['is_immidiate_Open']=df_3_sub.loc[df_3_sub['Sent_to_Open'] > 60] 
df_3_sub_sorted_Open['is_immidiate_Open']=np.where(df_3_sub_sorted_Open['Sent_to_Open'] < 30, 1, 0)#, df_3_sub['First Season']
df_3_sub_sorted_Open['is_immidiate_Click']=np.where(df_3_sub_sorted_Open['Open_to_Click'] < 3, 1, 0)
df_3_sub_sorted_Click['is_immidiate_Click']=np.where(df_3_sub_sorted_Click['Sent_to_Click'] < 3, 1, 0)
In [219]:
df_3_sub_sorted_Open.tail()
Out[219]:
SentId ME Action Email ME Event Type_FROM_CLICK_OR_OPEN BOT_ACTIVITY_FROM_CLICK_OR_OPEN ME_Is_Unique_FROM_CLICK_OR_OPEN ME Browser_FROM_CLICK_OR_OPEN ME Email Client_FROM_CLICK_OR_OPEN ME OperatingSystem_FROM_CLICK_OR_OPEN ME Device_FROM_CLICK_OR_OPEN Sent_Time Open_Time Click_Time Sent_to_Open Sent_to_Click Open_to_Click is_immidiate_Open is_immidiate_Click
23528 NL|2141027|148385|2021-04-15 zupanbk@gmail.com Open false False Safari iPhone iOS iPhone 2021-04-15 16:01:02 2021-04-15 21:03:02 NaT 18120.0 NaN NaN 0 0
23529 NL|2141027|148385|2021-04-15 zupanbk@gmail.com Open false True Safari iPhone iOS iPhone 2021-04-15 16:01:02 2021-04-15 21:03:02 NaT 18120.0 NaN NaN 0 0
23527 NL|2141027|148385|2021-04-15 zupanbk@gmail.com Open false False Safari iPhone iOS iPhone 2021-04-15 16:01:02 2021-04-15 21:04:27 NaT 18205.0 NaN NaN 0 0
23531 NL|2108596|148553|2021-04-06 zwelef@hotmail.com Open false True Unspecified Unspecified Windows PC 2021-04-06 10:00:47 2021-04-06 10:01:40 NaT 53.0 NaN NaN 0 0
23532 NL|2108596|148553|2021-04-06 zwelef@hotmail.com Open false False Unspecified Unspecified Windows PC 2021-04-06 10:00:47 2021-04-06 10:58:27 NaT 3460.0 NaN NaN 0 0
In [220]:
df_3_sub_sorted_Open.shape
Out[220]:
(17353, 17)
In [221]:
df_3_sub_sorted_Open.to_csv('df_3_sub_sorted_Open.csv')
In [222]:
df_3_sub_sorted_Open['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
Out[222]:
false             16704
Bot_open_model      649
Name: BOT_ACTIVITY_FROM_CLICK_OR_OPEN, dtype: int64
In [223]:
df_3_sub_sorted_Click.shape
Out[223]:
(6181, 16)
In [224]:
df_3_sub_sorted_Click['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
Out[224]:
false                        4333
Bot_click_model               929
Bot_click_Honeypot            470
Bot_click_Honeypot_around     449
Name: BOT_ACTIVITY_FROM_CLICK_OR_OPEN, dtype: int64
In [225]:
df_3_sub_sorted_Open.columns
Out[225]:
Index(['SentId', 'ME Action Email', 'ME Event Type_FROM_CLICK_OR_OPEN',
       'BOT_ACTIVITY_FROM_CLICK_OR_OPEN', 'ME_Is_Unique_FROM_CLICK_OR_OPEN',
       'ME Browser_FROM_CLICK_OR_OPEN', 'ME Email Client_FROM_CLICK_OR_OPEN',
       'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN',
       'Sent_Time', 'Open_Time', 'Click_Time', 'Sent_to_Open', 'Sent_to_Click',
       'Open_to_Click', 'is_immidiate_Open', 'is_immidiate_Click'],
      dtype='object')

Leackage detection (unseen, emaile leackage,...)

Email leackage check

In [226]:
df_3_sub_sorted_Open_1 = df_3_sub_sorted_Open.iloc[:12000,:]
df_3_sub_sorted_Open_2 = df_3_sub_sorted_Open.iloc[12000:,:] ## Unseen
data_unseen_Open = df_3_sub_sorted_Open_2
In [227]:
df_3_sub_sorted_Open_1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
Out[227]:
false             11578
Bot_open_model      422
Name: BOT_ACTIVITY_FROM_CLICK_OR_OPEN, dtype: int64

Unseen method 1

In [228]:
##### Unseen
df_3_sub_sorted_Open_2['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts() #### Unseen
Out[228]:
false             5126
Bot_open_model     227
Name: BOT_ACTIVITY_FROM_CLICK_OR_OPEN, dtype: int64

Unseen method 2

In [229]:
data = df_3_sub_sorted_Open.sample(frac = 0.95, random_state = 42)
data_unseen_random = df_3_sub_sorted_Open.drop(data.index)
data.reset_index(inplace = True, drop = True)
data_unseen_random.reset_index(inplace = True, drop = True)
In [230]:
data['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
Out[230]:
false             15872
Bot_open_model      613
Name: BOT_ACTIVITY_FROM_CLICK_OR_OPEN, dtype: int64
In [231]:
data_unseen_Open['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
Out[231]:
false             5126
Bot_open_model     227
Name: BOT_ACTIVITY_FROM_CLICK_OR_OPEN, dtype: int64
In [232]:
data_unseen_random['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
Out[232]:
false             832
Bot_open_model     36
Name: BOT_ACTIVITY_FROM_CLICK_OR_OPEN, dtype: int64
In [268]:
df_sub_sub.head(2)
Out[268]:
SentId ME Action Email ME Event Type_FROM_CLICK_OR_OPEN BOT_ACTIVITY_FROM_CLICK_OR_OPEN ME_Is_Unique_FROM_CLICK_OR_OPEN ME Browser_FROM_CLICK_OR_OPEN ME Email Client_FROM_CLICK_OR_OPEN ME OperatingSystem_FROM_CLICK_OR_OPEN ME Device_FROM_CLICK_OR_OPEN Sent_Time Open_Time Click_Time
0 NL|2553841|134552|2021-10-17 81@xs4all.nl Open -1 True Safari iPhone iOS iPhone 2021-10-17 10:03:52 2021-10-17 10:07:54 NaN
1 NL|2585825|109216|2021-10-28 Apotheekdehamershof@xs4all.nl Open -1 True Unspecified Unspecified Unspecified Unspecified 2021-10-28 10:03:45 2021-10-28 11:13:59 NaN
In [266]:
df_sub_sub = df_3_sub

df_sub_sub = df_sub_sub.loc[df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].astype(str) != 'Bot_open_model']
df_sub_sub = df_sub_sub.loc[df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].astype(str) != 'Bot_click_model']
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('false', -1)
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_Honeypot', 1)
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_Honeypot_around', 1)
# df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_open_model',-1)
# df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_model',-1)
df_3_sub_sorted_Open

## Unseen
# data = df_3_sub_sorted_Open.sample(frac=0.95, random_state=42)
# data_unseen = df_3_sub_sorted_Open.drop(data.index)
# data.reset_index(inplace=True, drop=True)
# data_unseen.reset_index(inplace=True, drop=True)

import pycaret
from pycaret.classification import *
exp_clf101 = setup(data = df_sub_sub, ignore_features=['ME Action Email', 'SentId', 'Sent_Time', 'Open_Time',   #df_3_sub_sorted_Open_1
                                                                 'Click_Time', 'ME Event Type_FROM_CLICK_OR_OPEN'
                                                                  ]      
                                                                 #'Sent_to_Open', 
                                                                 # 'is_immidiate_Open', 'is_immidiate_Click',
                                                                 #'ME Browser_FROM_CLICK_OR_OPEN', 'ME Email Client_FROM_CLICK_OR_OPEN',
                                                                 #'Open_to_Click','Sent_to_Click''ME_Is_Unique_FROM_CLICK_OR_OPEN', 
                                                               # 'ME OperatingSystem_FROM_CLICK_OR_OPEN'
                                                                , numeric_features=['Sent_to_Click', 'Open_to_Click'],
                    ignore_low_variance= True , combine_rare_levels= True, data_split_stratify = True, 
                   remove_multicollinearity= True,  use_gpu= True, target = 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN' , session_id=123) #preprocess=False,, 
                                                                                                                                #fix_imbalance = True,test_data: pandas.DataFrame 
                                                                                                                                #'Sent_to_Open', 'Sent_to_Click', , 'is_immidiate_Open', 
                                                                                                                                #'is_immidiate_Click''Open_to_Click', profile= True, 
                                                                                                                                #high_cardinality_features=['emailclient', 'operatingsystem']
                                                                                                                                #, transformation= True,feature_selection= True,
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-266-573b91a90a40> in <module>
     28                                                                 , numeric_features=['Sent_to_Click', 'Open_to_Click'],
     29                     ignore_low_variance= True , combine_rare_levels= True, data_split_stratify = True,
---> 30                    remove_multicollinearity= True,  use_gpu= True, target = 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN' , session_id=123) #preprocess=False,, 
     31                                                                                                                                 #fix_imbalance = True,test_data: pandas.DataFrame
     32                                                                                                                                 #'Sent_to_Open', 'Sent_to_Click', , 'is_immidiate_Open',

~/anaconda3/envs/python3/lib/python3.6/site-packages/pycaret/classification.py in setup(data, target, train_size, test_data, preprocess, imputation_type, iterative_imputation_iters, categorical_features, categorical_imputation, categorical_iterative_imputer, ordinal_features, high_cardinality_features, high_cardinality_method, numeric_features, numeric_imputation, numeric_iterative_imputer, date_features, ignore_features, normalize, normalize_method, transformation, transformation_method, handle_unknown_categorical, unknown_categorical_method, pca, pca_method, pca_components, ignore_low_variance, combine_rare_levels, rare_level_threshold, bin_numeric_features, remove_outliers, outliers_threshold, remove_multicollinearity, multicollinearity_threshold, remove_perfect_collinearity, create_clusters, cluster_iter, polynomial_features, polynomial_degree, trigonometry_features, polynomial_threshold, group_features, group_names, feature_selection, feature_selection_threshold, feature_selection_method, feature_interaction, feature_ratio, interaction_threshold, fix_imbalance, fix_imbalance_method, data_split_shuffle, data_split_stratify, fold_strategy, fold, fold_shuffle, fold_groups, n_jobs, use_gpu, custom_pipeline, html, session_id, log_experiment, experiment_name, log_plots, log_profile, log_data, silent, verbose, profile, profile_kwargs)
    653         verbose=verbose,
    654         profile=profile,
--> 655         profile_kwargs=profile_kwargs,
    656     )
    657 

~/anaconda3/envs/python3/lib/python3.6/site-packages/pycaret/internal/tabular.py in setup(data, target, ml_usecase, available_plots, train_size, test_data, preprocess, imputation_type, iterative_imputation_iters, categorical_features, categorical_imputation, categorical_iterative_imputer, ordinal_features, high_cardinality_features, high_cardinality_method, numeric_features, numeric_imputation, numeric_iterative_imputer, date_features, ignore_features, normalize, normalize_method, transformation, transformation_method, handle_unknown_categorical, unknown_categorical_method, pca, pca_method, pca_components, ignore_low_variance, combine_rare_levels, rare_level_threshold, bin_numeric_features, remove_outliers, outliers_threshold, remove_multicollinearity, multicollinearity_threshold, remove_perfect_collinearity, create_clusters, cluster_iter, polynomial_features, polynomial_degree, trigonometry_features, polynomial_threshold, group_features, group_names, feature_selection, feature_selection_threshold, feature_selection_method, feature_interaction, feature_ratio, interaction_threshold, fix_imbalance, fix_imbalance_method, transform_target, transform_target_method, data_split_shuffle, data_split_stratify, fold_strategy, fold, fold_shuffle, fold_groups, n_jobs, use_gpu, custom_pipeline, html, session_id, log_experiment, experiment_name, log_plots, log_profile, log_data, silent, verbose, profile, profile_kwargs, display)
    628             if i not in all_cols:
    629                 raise ValueError(
--> 630                     "Column type forced is either target column or doesn't exist in the dataset."
    631                 )
    632 

ValueError: Column type forced is either target column or doesn't exist in the dataset.
In [238]:
best_model_Open = compare_models()
Model Accuracy AUC Recall Prec. F1 Kappa MCC TT (Sec)
ada Ada Boost Classifier 0.9887 0.9967 0.9904 0.9979 0.9941 0.8485 0.8528 0.2620
gbc Gradient Boosting Classifier 0.9877 0.9965 0.9900 0.9973 0.9936 0.8351 0.8391 0.5680
knn K Neighbors Classifier 0.9871 0.9961 0.9909 0.9958 0.9933 0.8214 0.8239 0.4450
dt Decision Tree Classifier 0.9870 0.9866 0.9904 0.9962 0.9933 0.8219 0.8245 0.0150
lightgbm Light Gradient Boosting Machine 0.9870 0.9962 0.9909 0.9957 0.9933 0.8195 0.8216 0.0840
rf Random Forest Classifier 0.9867 0.9965 0.9910 0.9952 0.9931 0.8130 0.8146 0.9170
et Extra Trees Classifier 0.9864 0.9914 0.9900 0.9959 0.9929 0.8137 0.8165 0.9130
lr Logistic Regression 0.9862 0.9958 0.9909 0.9948 0.9928 0.8055 0.8069 0.1770
svm SVM - Linear Kernel 0.9829 0.0000 0.9877 0.9945 0.9911 0.7674 0.7715 0.3710
nb Naive Bayes 0.9827 0.9927 0.9821 1.0000 0.9910 0.7958 0.8134 0.0150
ridge Ridge Classifier 0.9770 0.0000 0.9804 0.9958 0.9880 0.7147 0.7303 0.0250
lda Linear Discriminant Analysis 0.9705 0.9897 0.9694 1.0000 0.9845 0.6914 0.7271 0.0730
qda Quadratic Discriminant Analysis 0.9649 0.5000 1.0000 0.9649 0.9821 0.0000 0.0000 0.0340
In [58]:
# gbc = create_model('gbc')
# tuned_gbc = tune_model(gbc)
In [258]:
knn = create_model('knn')
Accuracy AUC Recall Prec. F1 Kappa MCC
0 0.9644 0.9460 0.9868 0.9740 0.9804 0.7869 0.7889
1 0.9407 0.9355 0.9671 0.9671 0.9671 0.6641 0.6641
2 0.9435 0.9821 0.9638 0.9734 0.9686 0.6851 0.6859
3 0.9613 0.9849 0.9736 0.9833 0.9784 0.7901 0.7910
4 0.9524 0.9829 0.9769 0.9705 0.9737 0.7237 0.7241
5 0.9673 0.9841 0.9802 0.9834 0.9818 0.8176 0.8178
6 0.9554 0.9425 0.9736 0.9768 0.9752 0.7513 0.7514
7 0.9643 0.9707 0.9835 0.9770 0.9803 0.7928 0.7933
8 0.9702 0.9941 0.9802 0.9867 0.9834 0.8364 0.8368
9 0.9405 0.9483 0.9538 0.9797 0.9666 0.6967 0.7019
Mean 0.9560 0.9671 0.9740 0.9772 0.9756 0.7545 0.7555
SD 0.0107 0.0206 0.0095 0.0059 0.0060 0.0564 0.0560
In [259]:
print(knn)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform')
In [261]:
plot_model(knn, plot = 'auc')
In [257]:
print(best_model_Open)
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=123)
In [ ]:
 
In [239]:
#unseen_result = predict_model(best_model_Open, data = data_unseen)
unseen_result = predict_model(best_model_Open, data = data_unseen_Open)
unseen_result.head()
Out[239]:
SentId ME Action Email ME Event Type_FROM_CLICK_OR_OPEN BOT_ACTIVITY_FROM_CLICK_OR_OPEN ME_Is_Unique_FROM_CLICK_OR_OPEN ME Browser_FROM_CLICK_OR_OPEN ME Email Client_FROM_CLICK_OR_OPEN ME OperatingSystem_FROM_CLICK_OR_OPEN ME Device_FROM_CLICK_OR_OPEN Sent_Time Open_Time Click_Time Sent_to_Open Sent_to_Click Open_to_Click is_immidiate_Open is_immidiate_Click Label Score
16499 NL|2585824|81484|2021-10-28 michelledegroen@gmail.com Open false False Safari iPhone iOS iPhone 2021-10-28 10:03:55 2021-10-28 21:57:16 NaT 42801.0 NaN NaN 0 0 false 0.8163
16504 NL|2585824|81484|2021-10-28 michelledegroen@gmail.com Open false False Safari iPhone iOS iPhone 2021-10-28 10:03:55 2021-10-28 21:57:17 NaT 42802.0 NaN NaN 0 0 false 0.8163
16501 NL|2585824|81484|2021-10-28 michelledegroen@gmail.com Open false False Safari iPhone iOS iPhone 2021-10-28 10:03:55 2021-10-28 21:57:52 NaT 42837.0 NaN NaN 0 0 false 0.8163
16500 NL|2585837|81484|2021-10-31 michelledegroen@gmail.com Open false False Safari iPhone iOS iPhone 2021-10-31 10:02:01 2021-10-31 12:02:17 NaT 7216.0 NaN NaN 0 0 false 0.8163
16510 NL|2585837|81484|2021-10-31 michelledegroen@gmail.com Open false True Safari iPhone iOS iPhone 2021-10-31 10:02:01 2021-10-31 12:02:17 NaT 7216.0 NaN NaN 0 0 false 0.8117
In [240]:
confusion_matrix = pd.crosstab(unseen_result['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'], unseen_result['Label'], rownames=['Actual'], colnames=['Predicted'])
print (confusion_matrix)
Predicted       Bot_open_model  false
Actual                               
Bot_open_model             220      7
false                       58   5068
In [241]:
# FI imp
plot_model(gbc, plot='feature')
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-241-80455df8a345> in <module>
      1 # FI imp
----> 2 plot_model(gbc, plot='feature')

NameError: name 'gbc' is not defined
In [ ]:
 
In [ ]:
 

Leackage on Click

In [242]:
df_3_sub_sorted_Click.shape
Out[242]:
(6181, 16)
In [243]:
df_3_sub_sorted_Click.head()
Out[243]:
SentId ME Action Email ME Event Type_FROM_CLICK_OR_OPEN BOT_ACTIVITY_FROM_CLICK_OR_OPEN ME_Is_Unique_FROM_CLICK_OR_OPEN ME Browser_FROM_CLICK_OR_OPEN ME Email Client_FROM_CLICK_OR_OPEN ME OperatingSystem_FROM_CLICK_OR_OPEN ME Device_FROM_CLICK_OR_OPEN Sent_Time Open_Time Click_Time Sent_to_Open Sent_to_Click Open_to_Click is_immidiate_Click
4 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Click false True Android Android Android Unspecified 2021-11-09 10:02:16 2021-11-09 16:26:59 2021-11-09 10:51:24 23083.0 2948.0 -20135.0 0
5 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Click false False Android Android Android Unspecified 2021-11-09 10:02:16 2021-11-09 16:26:59 2021-11-09 10:51:57 23083.0 2981.0 -20102.0 0
10 NL|2553835|130706|2021-10-14 HDOEI@TELENET.BE Click false True Unspecified Unspecified Windows PC 2021-10-14 10:04:04 2021-10-14 13:40:53 2021-10-14 10:56:37 13009.0 3153.0 -9856.0 0
11 NL|2553835|130706|2021-10-14 HDOEI@TELENET.BE Click false False Unspecified Unspecified Windows PC 2021-10-14 10:04:04 2021-10-14 13:40:53 2021-10-14 10:57:28 13009.0 3204.0 -9805.0 0
13 NL|2354993|114148|2021-07-20 Lammy.elving@radboudumc.nl Click false True Unspecified Unspecified Unspecified Unspecified 2021-07-20 10:03:50 2021-07-20 12:02:09 2021-07-20 12:04:15 7099.0 7225.0 126.0 0
In [244]:
df_3_sub_sorted_Click['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
Out[244]:
false                        4333
Bot_click_model               929
Bot_click_Honeypot            470
Bot_click_Honeypot_around     449
Name: BOT_ACTIVITY_FROM_CLICK_OR_OPEN, dtype: int64
In [245]:
df_3_sub_sorted_Click_NoGrmn['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
Out[245]:
false                 4333
Bot_click_Honeypot     470
Name: BOT_ACTIVITY_FROM_CLICK_OR_OPEN, dtype: int64
In [246]:
df_3_sub_sorted_Click_NoGrmn_1 = df_3_sub_sorted_Click_NoGrmn.iloc[:4000,:]
df_3_sub_sorted_Click_NoGrmn_2 = df_3_sub_sorted_Click_NoGrmn.iloc[4000:,:] ## Unseen
data_unseen_Click = df_3_sub_sorted_Click_NoGrmn_2
In [247]:
df_3_sub_sorted_Click_NoGrmn_1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
Out[247]:
false                 3592
Bot_click_Honeypot     408
Name: BOT_ACTIVITY_FROM_CLICK_OR_OPEN, dtype: int64
In [248]:
df_3_sub_sorted_Click_NoGrmn_2['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
Out[248]:
false                 741
Bot_click_Honeypot     62
Name: BOT_ACTIVITY_FROM_CLICK_OR_OPEN, dtype: int64
In [249]:
df_3_sub_sorted_Click_NoGrmn.head()
Out[249]:
SentId ME Action Email ME Event Type_FROM_CLICK_OR_OPEN BOT_ACTIVITY_FROM_CLICK_OR_OPEN ME_Is_Unique_FROM_CLICK_OR_OPEN ME Browser_FROM_CLICK_OR_OPEN ME Email Client_FROM_CLICK_OR_OPEN ME OperatingSystem_FROM_CLICK_OR_OPEN ME Device_FROM_CLICK_OR_OPEN Sent_Time Open_Time Click_Time Sent_to_Open Sent_to_Click Open_to_Click is_immidiate_Click
4 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Click false True Android Android Android Unspecified 2021-11-09 10:02:16 2021-11-09 16:26:59 2021-11-09 10:51:24 23083.0 2948.0 -20135.0 0
5 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Click false False Android Android Android Unspecified 2021-11-09 10:02:16 2021-11-09 16:26:59 2021-11-09 10:51:57 23083.0 2981.0 -20102.0 0
10 NL|2553835|130706|2021-10-14 HDOEI@TELENET.BE Click false True Unspecified Unspecified Windows PC 2021-10-14 10:04:04 2021-10-14 13:40:53 2021-10-14 10:56:37 13009.0 3153.0 -9856.0 0
11 NL|2553835|130706|2021-10-14 HDOEI@TELENET.BE Click false False Unspecified Unspecified Windows PC 2021-10-14 10:04:04 2021-10-14 13:40:53 2021-10-14 10:57:28 13009.0 3204.0 -9805.0 0
13 NL|2354993|114148|2021-07-20 Lammy.elving@radboudumc.nl Click false True Unspecified Unspecified Unspecified Unspecified 2021-07-20 10:03:50 2021-07-20 12:02:09 2021-07-20 12:04:15 7099.0 7225.0 126.0 0

Click without German

In [ ]:
 
In [251]:
df_sub_sub = df_3_sub

df_sub_sub = df_sub_sub.loc[df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].astype(str) != 'Bot_open_model']
df_3_sub_sorted_Click_NoGrmn = df_3_sub_sorted_Click.loc[df_3_sub_sorted_Click['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].astype(str) != 'Bot_click_model']
df_3_sub_sorted_Click_NoGrmn = df_3_sub_sorted_Click_NoGrmn.loc[df_3_sub_sorted_Click_NoGrmn['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].astype(str) != 'Bot_click_Honeypot_around']
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('false', -1)
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_Honeypot', 1)
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_Honeypot_around', 1)
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_open_model',-1)
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_model',-1)


## Unseen
# data_Click = df_3_sub_sorted_Click_NoGrmn.sample(frac=0.95, random_state=42)
# data_unseen_Click = df_3_sub_sorted_Click_NoGrmn.drop(data_Click.index)
# data_Click.reset_index(inplace=True, drop=True)
# data_unseen_Click.reset_index(inplace=True, drop=True)

import pycaret
from pycaret.classification import *
exp_clf101 = setup(data = df_3_sub_sorted_Click_NoGrmn, ignore_features=['ME Action Email', 
                                                       'SentId', 'Sent_Time', 'Open_Time', 
                                                       'Click_Time', 'ME Event Type_FROM_CLICK_OR_OPEN' 
                                                     ], # 'Open_to_Click', 'Sent_to_Open', 'is_immidiate_Click','Sent_to_Click'numeric_features=['Sent_to_Click', 'Open_to_Click'],
                    ignore_low_variance= True , combine_rare_levels= True, data_split_stratify = True,
                   remove_multicollinearity= True,  use_gpu= True, target = 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN' , session_id=123) #, profile= True, high_cardinality_features=['emailclient', 'operatingsystem'], transformation= True,feature_selection= True,
Description Value
0 session_id 123
1 Target BOT_ACTIVITY_FROM_CLICK_OR_OPEN
2 Target Type Binary
3 Label Encoded Bot_click_Honeypot: 0, false: 1
4 Original Data (4803, 16)
5 Missing Values False
6 Numeric Features 3
7 Categorical Features 6
8 Ordinal Features False
9 High Cardinality Features False
10 High Cardinality Method None
11 Transformed Train Set (3362, 23)
12 Transformed Test Set (1441, 23)
13 Shuffle Train-Test True
14 Stratify Train-Test True
15 Fold Generator StratifiedKFold
16 Fold Number 10
17 CPU Jobs -1
18 Use GPU True
19 Log Experiment False
20 Experiment Name clf-default-name
21 USI 1165
22 Imputation Type simple
23 Iterative Imputation Iteration None
24 Numeric Imputer mean
25 Iterative Imputation Numeric Model None
26 Categorical Imputer constant
27 Iterative Imputation Categorical Model None
28 Unknown Categoricals Handling least_frequent
29 Normalize False
30 Normalize Method None
31 Transformation False
32 Transformation Method None
33 PCA False
34 PCA Method None
35 PCA Components None
36 Ignore Low Variance True
37 Combine Rare Levels True
38 Rare Level Threshold 0.100000
39 Numeric Binning False
40 Remove Outliers False
41 Outliers Threshold None
42 Remove Multicollinearity True
43 Multicollinearity Threshold 0.900000
44 Remove Perfect Collinearity True
45 Clustering False
46 Clustering Iteration None
47 Polynomial Features False
48 Polynomial Degree None
49 Trignometry Features False
50 Polynomial Threshold None
51 Group Features False
52 Feature Selection False
53 Feature Selection Method classic
54 Features Selection Threshold None
55 Feature Interaction False
56 Feature Ratio False
57 Interaction Threshold None
58 Fix Imbalance False
59 Fix Imbalance Method SMOTE
In [252]:
best_model_Click = compare_models()
Model Accuracy AUC Recall Prec. F1 Kappa MCC TT (Sec)
et Extra Trees Classifier 0.9768 0.9771 0.9875 0.9868 0.9871 0.8685 0.8691 0.8990
rf Random Forest Classifier 0.9765 0.9881 0.9888 0.9853 0.9870 0.8643 0.8653 0.9030
lightgbm Light Gradient Boosting Machine 0.9726 0.9862 0.9868 0.9829 0.9849 0.8423 0.8429 0.1150
dt Decision Tree Classifier 0.9706 0.9295 0.9805 0.9868 0.9836 0.8375 0.8384 0.0160
gbc Gradient Boosting Classifier 0.9655 0.9819 0.9855 0.9765 0.9810 0.7977 0.7998 0.3590
knn K Neighbors Classifier 0.9560 0.9671 0.9740 0.9772 0.9756 0.7545 0.7555 0.0950
ada Ada Boost Classifier 0.9417 0.9705 0.9779 0.9584 0.9680 0.6365 0.6438 0.1730
lda Linear Discriminant Analysis 0.9066 0.7365 0.9848 0.9177 0.9501 0.2423 0.2878 0.0380
lr Logistic Regression 0.9033 0.6935 0.9944 0.9074 0.9489 0.0933 0.1423 0.0930
ridge Ridge Classifier 0.9021 0.0000 1.0000 0.9021 0.9486 0.0000 0.0000 0.0210
svm SVM - Linear Kernel 0.8132 0.0000 0.8885 0.9044 0.8790 -0.0019 -0.0013 0.0230
nb Naive Bayes 0.5982 0.8113 0.5631 0.9850 0.7161 0.1762 0.2883 0.0120
qda Quadratic Discriminant Analysis 0.0979 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0230
In [161]:
# FI imp
plot_model(best_model_Click, plot='feature')
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
In [162]:
#unseen_result = predict_model(best_model_Open, data = data_unseen)
unseen_result_Click = predict_model(best_model_Click, data = data_unseen_Click)
unseen_result_Click.head(60)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-162-e4bbe32381c1> in <module>
      1 #unseen_result = predict_model(best_model_Open, data = data_unseen)
----> 2 unseen_result_Click = predict_model(best_model_Click, data = data_unseen_Click)
      3 unseen_result_Click.head(60)

NameError: name 'data_unseen_Click' is not defined
In [163]:
unseen_result_Click['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts() 
unseen_result_Click['Label'].value_counts()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-163-b6953a62dabc> in <module>
----> 1 unseen_result_Click['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
      2 unseen_result_Click['Label'].value_counts()

NameError: name 'unseen_result_Click' is not defined
In [164]:
confusion_matrix = pd.crosstab(unseen_result_Click['Label'], unseen_result_Click['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'], rownames=['Actual'], colnames=['Predicted'])
print (confusion_matrix) 
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-164-d053bfe7bfca> in <module>
----> 1 confusion_matrix = pd.crosstab(unseen_result_Click['Label'], unseen_result_Click['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'], rownames=['Actual'], colnames=['Predicted'])
      2 print (confusion_matrix)

NameError: name 'unseen_result_Click' is not defined
In [ ]:
 
In [165]:
#df_sub_sub = df_3_sub

df_3_sub_sorted_Click['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_3_sub_sorted_Click['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('false', -1)
df_3_sub_sorted_Click['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_3_sub_sorted_Click['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_Honeypot', 1)
df_3_sub_sorted_Click['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_3_sub_sorted_Click['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_Honeypot_around', 1)
df_3_sub_sorted_Click['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_3_sub_sorted_Click['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_model', 1)

import pycaret
from pycaret.classification import *
exp_clf101 = setup(data = df_3_sub_sorted_Click, ignore_features=['SentId', 'Sent_Time', 'Open_Time', 'Click_Time', 'ME Event Type_FROM_CLICK_OR_OPEN'], #numeric_features=['Sent_to_Click', 'Open_to_Click'],
                    ignore_low_variance= True , combine_rare_levels= True,
                   remove_multicollinearity= True,  use_gpu= True, target = 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN' , session_id=123) #, profile= True, high_cardinality_features=['emailclient', 'operatingsystem'], transformation= True,feature_selection= True,
Description Value
0 session_id 123
1 Target BOT_ACTIVITY_FROM_CLICK_OR_OPEN
2 Target Type Binary
3 Label Encoded None
4 Original Data (6181, 16)
5 Missing Values False
6 Numeric Features 3
7 Categorical Features 7
8 Ordinal Features False
9 High Cardinality Features False
10 High Cardinality Method None
11 Transformed Train Set (4326, 74)
12 Transformed Test Set (1855, 74)
13 Shuffle Train-Test True
14 Stratify Train-Test False
15 Fold Generator StratifiedKFold
16 Fold Number 10
17 CPU Jobs -1
18 Use GPU True
19 Log Experiment False
20 Experiment Name clf-default-name
21 USI 252c
22 Imputation Type simple
23 Iterative Imputation Iteration None
24 Numeric Imputer mean
25 Iterative Imputation Numeric Model None
26 Categorical Imputer constant
27 Iterative Imputation Categorical Model None
28 Unknown Categoricals Handling least_frequent
29 Normalize False
30 Normalize Method None
31 Transformation False
32 Transformation Method None
33 PCA False
34 PCA Method None
35 PCA Components None
36 Ignore Low Variance True
37 Combine Rare Levels True
38 Rare Level Threshold 0.100000
39 Numeric Binning False
40 Remove Outliers False
41 Outliers Threshold None
42 Remove Multicollinearity True
43 Multicollinearity Threshold 0.900000
44 Remove Perfect Collinearity True
45 Clustering False
46 Clustering Iteration None
47 Polynomial Features False
48 Polynomial Degree None
49 Trignometry Features False
50 Polynomial Threshold None
51 Group Features False
52 Feature Selection False
53 Feature Selection Method classic
54 Features Selection Threshold None
55 Feature Interaction False
56 Feature Ratio False
57 Interaction Threshold None
58 Fix Imbalance False
59 Fix Imbalance Method SMOTE
In [166]:
best_model_Click_2 = compare_models()
Model Accuracy AUC Recall Prec. F1 Kappa MCC TT (Sec)
rf Random Forest Classifier 0.9184 0.9647 0.8764 0.8573 0.8661 0.8075 0.8082 1.0190
dt Decision Tree Classifier 0.9149 0.9138 0.8849 0.8417 0.8622 0.8008 0.8018 0.0330
et Extra Trees Classifier 0.9122 0.9462 0.8680 0.8461 0.8561 0.7930 0.7938 1.0410
lightgbm Light Gradient Boosting Machine 0.9029 0.9609 0.8096 0.8614 0.8343 0.7657 0.7668 0.1150
gbc Gradient Boosting Classifier 0.8696 0.9246 0.7114 0.8336 0.7671 0.6773 0.6819 0.6250
knn K Neighbors Classifier 0.8648 0.9321 0.7997 0.7633 0.7809 0.6832 0.6838 0.3220
ada Ada Boost Classifier 0.8449 0.9039 0.7083 0.7624 0.7335 0.6244 0.6260 0.2430
ridge Ridge Classifier 0.8111 0.0000 0.5778 0.7392 0.6477 0.5215 0.5295 0.0290
lda Linear Discriminant Analysis 0.8109 0.8789 0.6039 0.7232 0.6574 0.5285 0.5331 0.0950
lr Logistic Regression 0.7293 0.6931 0.1719 0.7073 0.2577 0.1732 0.2388 0.1210
nb Naive Bayes 0.7259 0.7592 0.2225 0.6295 0.3279 0.2023 0.2460 0.0150
qda Quadratic Discriminant Analysis 0.7085 0.5179 0.0384 0.9248 0.0712 0.0485 0.1395 0.0410
svm SVM - Linear Kernel 0.6165 0.0000 0.3190 0.1832 0.2285 0.0574 0.0591 0.0970
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [534]:
#scatterplot
sns.set()
cols = ['Open_to_Click', 'Sent_to_Open', 'is_immidiate_Click',
        'Sent_to_Click', 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN']#'BOT_ACTIVITY_x', 'BOT_ACTIVITY_y', 'ME_Is_Unique_x'
sns.pairplot(df_3_sub_sorted_Click_NoGrmn[cols], size = 2.5)
plt.show();
In [541]:
g= sns.pairplot(df_3_sub_sorted_Click_NoGrmn_corr, hue = 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN', diag_kind= 'hist',
             vars=df_3_sub_sorted_Click_NoGrmn_corr.columns[:-1],
             plot_kws=dict(alpha=0.5), 
             diag_kws=dict(alpha=0.5))
plt.show()
In [540]:
df_3_sub_sorted_Click_NoGrmn_corr=df_3_sub_sorted_Click_NoGrmn[['BOT_ACTIVITY_FROM_CLICK_OR_OPEN', 
       'Sent_to_Open', 'Sent_to_Click',
       'Open_to_Click']]#'ME_Is_Unique_FROM_CLICK_OR_OPEN',
       #'ME Browser_FROM_CLICK_OR_OPEN', 'ME Email Client_FROM_CLICK_OR_OPEN',
       #'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN',
In [538]:
df_3_sub_sorted_Click_NoGrmn_corr.columns[:-1]
Out[538]:
Index(['BOT_ACTIVITY_FROM_CLICK_OR_OPEN', 'ME_Is_Unique_FROM_CLICK_OR_OPEN',
       'ME Browser_FROM_CLICK_OR_OPEN', 'ME Email Client_FROM_CLICK_OR_OPEN',
       'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN',
       'Sent_to_Open', 'Sent_to_Click'],
      dtype='object')
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [74]:
df_3_sub = df_3[['SentId','ME Action Email', 'ME Event Type_FROM_CLICK_OR_OPEN',
       'BOT_ACTIVITY_FROM_CLICK_OR_OPEN',
       'ME_Is_Unique_FROM_CLICK_OR_OPEN', 
       'ME Browser_FROM_CLICK_OR_OPEN', 'ME Email Client_FROM_CLICK_OR_OPEN',
       'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN', 'Sent_Time', 'Open_Time', 'Click_Time']]
df_3_sub.head(30)
Out[74]:
SentId ME Action Email ME Event Type_FROM_CLICK_OR_OPEN BOT_ACTIVITY_FROM_CLICK_OR_OPEN ME_Is_Unique_FROM_CLICK_OR_OPEN ME Browser_FROM_CLICK_OR_OPEN ME Email Client_FROM_CLICK_OR_OPEN ME OperatingSystem_FROM_CLICK_OR_OPEN ME Device_FROM_CLICK_OR_OPEN Sent_Time Open_Time Click_Time
0 NL|2553841|134552|2021-10-17 81@xs4all.nl Open false True Safari iPhone iOS iPhone 2021-10-17 10:03:52 2021-10-17 10:07:54 NaN
1 NL|2585825|109216|2021-10-28 Apotheekdehamershof@xs4all.nl Open false True Unspecified Unspecified Unspecified Unspecified 2021-10-28 10:03:45 2021-10-28 11:13:59 NaN
2 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Open false True Firefox Unspecified Windows XP PC 2021-11-09 10:02:16 2021-11-09 10:50:17 NaN
3 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Open false False Firefox Unspecified Windows XP PC 2021-11-09 10:02:16 2021-11-09 16:26:59 NaN
4 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Click false True Android Android Android Unspecified 2021-11-09 10:02:16 2021-11-09 16:26:59 2021-11-09 10:51:24
5 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Click false False Android Android Android Unspecified 2021-11-09 10:02:16 2021-11-09 16:26:59 2021-11-09 10:51:57
6 NL|2553837|130706|2021-10-17 HDOEI@TELENET.BE Open false True Safari iPhone iOS iPhone 2021-10-17 10:03:43 2021-10-17 20:34:23 NaN
7 NL|2553835|130706|2021-10-14 HDOEI@TELENET.BE Open false True Unspecified Unspecified Unspecified Unspecified 2021-10-14 10:04:04 2021-10-14 10:56:22 NaN
8 NL|2553835|130706|2021-10-14 HDOEI@TELENET.BE Open false False Unspecified Unspecified Unspecified Unspecified 2021-10-14 10:04:04 2021-10-14 11:27:09 NaN
9 NL|2553835|130706|2021-10-14 HDOEI@TELENET.BE Open false False Unspecified Unspecified Unspecified Unspecified 2021-10-14 10:04:04 2021-10-14 13:40:53 NaN
10 NL|2553835|130706|2021-10-14 HDOEI@TELENET.BE Click false True Unspecified Unspecified Windows PC 2021-10-14 10:04:04 2021-10-14 13:40:53 2021-10-14 10:56:37
11 NL|2553835|130706|2021-10-14 HDOEI@TELENET.BE Click false False Unspecified Unspecified Windows PC 2021-10-14 10:04:04 2021-10-14 13:40:53 2021-10-14 10:57:28
12 NL|2354993|114148|2021-07-20 Lammy.elving@radboudumc.nl Open false True Safari iPhone iOS iPhone 2021-07-20 10:03:50 2021-07-20 12:02:09 NaN
13 NL|2354993|114148|2021-07-20 Lammy.elving@radboudumc.nl Click false True Unspecified Unspecified Unspecified Unspecified 2021-07-20 10:03:50 2021-07-20 12:02:09 2021-07-20 12:04:15
14 NL|2553835|88990|2021-10-14 Marleen.Vosjan@wza.nl Click false True Unspecified Unspecified Windows PC 2021-10-14 10:04:04 2021-10-14 14:07:28 2021-10-14 14:07:34
15 NL|2553835|88990|2021-10-14 Marleen.Vosjan@wza.nl Open false True Unspecified Unspecified Windows Unspecified 2021-10-14 10:04:04 2021-10-14 14:07:28 NaN
16 NL|2540669|109737|2021-10-10 OPVOS@NEDERFOX.COM Click false True Android Android Android Unspecified 2021-10-10 10:02:55 2021-10-10 11:37:34 2021-10-10 11:29:31
17 NL|2540669|109737|2021-10-10 OPVOS@NEDERFOX.COM Open false True Android Android Android Unspecified 2021-10-10 10:02:55 2021-10-10 11:37:34 NaN
18 NL|2536355|102563|2021-10-07 a.ali@apotheekmonster.nl Open false True Unspecified Thunderbird Windows PC 2021-10-07 10:02:04 2021-10-07 10:15:42 NaN
19 NL|2105290|102563|2021-03-31 a.ali@apotheekmonster.nl Open false True Unspecified Thunderbird Windows PC 2021-03-31 10:00:59 2021-03-31 12:11:21 NaN
20 NL|2536370|102563|2021-10-13 a.ali@apotheekmonster.nl Open false True Unspecified Thunderbird Windows PC 2021-10-13 10:02:54 2021-10-13 10:05:27 NaN
21 NL|2585825|102563|2021-10-28 a.ali@apotheekmonster.nl Open false True Unspecified Thunderbird Windows PC 2021-10-28 10:03:46 2021-10-28 10:05:39 NaN
22 NL|2354993|102563|2021-07-20 a.ali@apotheekmonster.nl Open false False Unspecified Apple Mail 1 Mac OS X 10.1 Macintosh 2021-07-20 10:04:08 2021-07-20 13:28:45 NaN
23 NL|2354993|102563|2021-07-20 a.ali@apotheekmonster.nl Open false False Unspecified Apple Mail 1 Mac OS X 10.1 Macintosh 2021-07-20 10:04:08 2021-07-20 13:26:09 NaN
24 NL|2354993|102563|2021-07-20 a.ali@apotheekmonster.nl Open false True Unspecified Apple Mail 1 Mac OS X 10.1 Macintosh 2021-07-20 10:04:08 2021-07-20 13:26:08 NaN
25 NL|2354993|140133|2021-07-20 a.altoma@antoniusziekenhuis.nl Open false False Safari iPhone iOS iPhone 2021-07-20 10:04:04 2021-07-20 17:22:53 NaN
26 NL|2354993|140133|2021-07-20 a.altoma@antoniusziekenhuis.nl Open false True Safari iPhone iOS iPhone 2021-07-20 10:04:04 2021-07-20 12:38:39 NaN
27 NL|2123412|115520|2021-04-08 a.b.francken@isala.nl Open false True Unspecified Unspecified Windows PC 2021-04-08 16:01:07 2021-04-08 21:48:41 NaN
28 NL|2108171|103847|2021-04-05 a.becker@vumc.nl Open false True Safari iPhone iOS iPhone 2021-04-05 13:00:38 2021-04-05 13:12:29 NaN
29 NL|2555383|103847|2021-10-14 a.becker@vumc.nl Open false True Safari iPhone iOS iPhone 2021-10-14 10:00:51 2021-10-14 14:03:47 NaN
In [274]:
# Change values to datetime, create new columns for dif and change to seconds
df_3_sub['Sent_Time'] = pd.to_datetime(df_3_sub['Sent_Time']) 
df_3_sub['Open_Time'] = pd.to_datetime(df_3_sub['Open_Time'])
df_3_sub['Click_Time'] = pd.to_datetime(df_3_sub['Click_Time'])

df_3_sub['Sent_to_Open'] = df_3_sub['Open_Time'] - df_3_sub['Sent_Time']
df_3_sub['Sent_to_Open'] = df_3_sub['Sent_to_Open']/np.timedelta64(1,'s')

df_3_sub['Sent_to_Click'] = df_3_sub['Click_Time'] - df_3_sub['Sent_Time']
df_3_sub['Sent_to_Click'] = df_3_sub['Sent_to_Click']/np.timedelta64(1,'s')

df_3_sub['Open_to_Click'] = df_3_sub['Click_Time'] - df_3_sub['Open_Time']
df_3_sub['Open_to_Click'] = df_3_sub['Open_to_Click']/np.timedelta64(1,'s')
In [275]:
df_3_sub.head()
Out[275]:
SentId ME Action Email ME Event Type_FROM_CLICK_OR_OPEN BOT_ACTIVITY_FROM_CLICK_OR_OPEN ME_Is_Unique_FROM_CLICK_OR_OPEN ME Browser_FROM_CLICK_OR_OPEN ME Email Client_FROM_CLICK_OR_OPEN ME OperatingSystem_FROM_CLICK_OR_OPEN ME Device_FROM_CLICK_OR_OPEN Sent_Time Open_Time Click_Time Sent_to_Open Sent_to_Click Open_to_Click
0 NL|2553841|134552|2021-10-17 81@xs4all.nl Open false True Safari iPhone iOS iPhone 2021-10-17 10:03:52 2021-10-17 10:07:54 NaT 242.0 NaN NaN
1 NL|2585825|109216|2021-10-28 Apotheekdehamershof@xs4all.nl Open false True Unspecified Unspecified Unspecified Unspecified 2021-10-28 10:03:45 2021-10-28 11:13:59 NaT 4214.0 NaN NaN
2 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Open false True Firefox Unspecified Windows XP PC 2021-11-09 10:02:16 2021-11-09 10:50:17 NaT 2881.0 NaN NaN
3 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Open false False Firefox Unspecified Windows XP PC 2021-11-09 10:02:16 2021-11-09 16:26:59 NaT 23083.0 NaN NaN
4 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Click false True Android Android Android Unspecified 2021-11-09 10:02:16 2021-11-09 16:26:59 2021-11-09 10:51:24 23083.0 2948.0 -20135.0

Add reference time

In [276]:
df_3_sub['reference_time']= '2021-03-01 00:00:00'
#df_3_sub['ME Event Date_FROM_CLICK_OR_OPEN'] = pd.to_datetime(df_sub['ME Event Date_FROM_CLICK_OR_OPEN'])
df_3_sub['reference_time'] = pd.to_datetime(df_3_sub['reference_time'])

df_3_sub['reference_time_to_Open']= df_3_sub['Open_Time'] - df_3_sub['reference_time']
df_3_sub['reference_time_to_Open']= df_3_sub['reference_time_to_Open']/np.timedelta64(1,'s')

df_3_sub['reference_time_to_Click']= df_3_sub['Click_Time'] - df_3_sub['reference_time']
df_3_sub['reference_time_to_Click']= df_3_sub['reference_time_to_Click']/np.timedelta64(1,'s')

df_3_sub['reference_time_to_Sent']= df_3_sub['Sent_Time'] - df_3_sub['reference_time']
df_3_sub['reference_time_to_Sent']= df_3_sub['reference_time_to_Sent']/np.timedelta64(1,'s')

Add is immediate open and click

In [277]:
# df_3_sub['is_immidiate_Open']=df_3_sub.loc[df_3_sub['Sent_to_Open'] > 60] 
df_3_sub['is_immidiate_Open']=np.where(df_3_sub['Sent_to_Open'] < 30, 1, 0)#, df_3_sub['First Season']
df_3_sub['is_immidiate_Click']=np.where(df_3_sub['Open_to_Click'] < 3, 1, 0)
In [278]:
df_3_sub.head(3)
Out[278]:
SentId ME Action Email ME Event Type_FROM_CLICK_OR_OPEN BOT_ACTIVITY_FROM_CLICK_OR_OPEN ME_Is_Unique_FROM_CLICK_OR_OPEN ME Browser_FROM_CLICK_OR_OPEN ME Email Client_FROM_CLICK_OR_OPEN ME OperatingSystem_FROM_CLICK_OR_OPEN ME Device_FROM_CLICK_OR_OPEN Sent_Time ... Click_Time Sent_to_Open Sent_to_Click Open_to_Click reference_time reference_time_to_Open reference_time_to_Click reference_time_to_Sent is_immidiate_Open is_immidiate_Click
0 NL|2553841|134552|2021-10-17 81@xs4all.nl Open false True Safari iPhone iOS iPhone 2021-10-17 10:03:52 ... NaT 242.0 NaN NaN 2021-03-01 19908474.0 NaN 19908232.0 0 0
1 NL|2585825|109216|2021-10-28 Apotheekdehamershof@xs4all.nl Open false True Unspecified Unspecified Unspecified Unspecified 2021-10-28 10:03:45 ... NaT 4214.0 NaN NaN 2021-03-01 20862839.0 NaN 20858625.0 0 0
2 NL|2585841|83606|2021-11-09 Arbharos@gmail.com Open false True Firefox Unspecified Windows XP PC 2021-11-09 10:02:16 ... NaT 2881.0 NaN NaN 2021-03-01 21898217.0 NaN 21895336.0 0 0

3 rows × 21 columns

In [279]:
df_3_sub = df_3_sub[['ME Action Email',
       'BOT_ACTIVITY_FROM_CLICK_OR_OPEN',
       'ME_Is_Unique_FROM_CLICK_OR_OPEN', 
       'ME Browser_FROM_CLICK_OR_OPEN', 'ME Email Client_FROM_CLICK_OR_OPEN',
       'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN', 
       'Sent_to_Open', 'Sent_to_Click', 'Open_to_Click','reference_time_to_Open','reference_time_to_Click','reference_time_to_Sent']]#,'is_immidiate_Open''is_immidiate_Click', , 'Sent_Time','Open_Time','Click_Time'
       
df_3_sub.head()
Out[279]:
ME Action Email BOT_ACTIVITY_FROM_CLICK_OR_OPEN ME_Is_Unique_FROM_CLICK_OR_OPEN ME Browser_FROM_CLICK_OR_OPEN ME Email Client_FROM_CLICK_OR_OPEN ME OperatingSystem_FROM_CLICK_OR_OPEN ME Device_FROM_CLICK_OR_OPEN Sent_to_Open Sent_to_Click Open_to_Click reference_time_to_Open reference_time_to_Click reference_time_to_Sent
0 81@xs4all.nl false True Safari iPhone iOS iPhone 242.0 NaN NaN 19908474.0 NaN 19908232.0
1 Apotheekdehamershof@xs4all.nl false True Unspecified Unspecified Unspecified Unspecified 4214.0 NaN NaN 20862839.0 NaN 20858625.0
2 Arbharos@gmail.com false True Firefox Unspecified Windows XP PC 2881.0 NaN NaN 21898217.0 NaN 21895336.0
3 Arbharos@gmail.com false False Firefox Unspecified Windows XP PC 23083.0 NaN NaN 21918419.0 NaN 21895336.0
4 Arbharos@gmail.com false True Android Android Android Unspecified 23083.0 2948.0 -20135.0 21918419.0 21898284.0 21895336.0
In [81]:
# d=df_3_sub.dropna()
# d.shape
# d['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
In [82]:
plt.figure(figsize=(8,6))
plt.hist(df_3_sub['Sent_to_Open'], bins=30, alpha=0.55, label="sent_to_Open", color='purple')
plt.hist(df_3_sub['Sent_to_Click'], bins=90, alpha=0.99, label="sent_to_click", color='black')
plt.hist(df_3_sub['Open_to_Click'], bins=90, alpha=0.5, label="open_to_click", color='yellow')

plt.xlabel("time", size=14)
plt.ylabel("Count", size=14)
plt.title("Sent to Open/Click")
plt.legend(loc='upper right')
plt.savefig("overlapping_histograms_with_matplotlib_Python_2.png")
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
In [280]:
df_3_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
Out[280]:
false                        21037
Bot_click_model                929
Bot_open_model                 649
Bot_click_Honeypot             470
Bot_click_Honeypot_around      449
Name: BOT_ACTIVITY_FROM_CLICK_OR_OPEN, dtype: int64
In [281]:
### Just keep honeypot labels - O, 1 ###

df_sub_sub1 = df_3_sub 
#[['BOT_ACTIVITY_FROM_CLICK_OR_OPEN',
#        'ME_Is_Unique_FROM_CLICK_OR_OPEN', 'ME Browser_FROM_CLICK_OR_OPEN',
#        'ME Email Client_FROM_CLICK_OR_OPEN',
#        'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN'
#        ]]#,'Sent_to_OpenClick', 'refrence_time_to_OpenClick'

df_sub_sub1 = df_sub_sub1.loc[df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].astype(str) != 'Bot_open_model']
df_sub_sub1 = df_sub_sub1.loc[df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].astype(str) != 'Bot_click_model']
df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('false', 0)
df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_Honeypot', 1)
df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_Honeypot_around', 1)
# df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_open_model',1)
# df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_model',1)


######### Pycaret ###########
#############################
# Importing module and initializing setup
from pycaret.anomaly import *
ano1 = setup(data = df_sub_sub1, session_id=323)# session_id=323===602##, preprocess=True, imputation_type = 'simple',numeric_imputation = 'mean', normalize= True, transformation=True,handle_unknown_categorical = True , remove_multicollinearity=True, high_cardinality_features = True fraction = 0.05,, normalize= True 
# creating a model

iforest = create_model('iforest')
# plotting a model
#plot_model(iforest, plot = 'tsne')#, feature = None, label = False, scale = 1, save False, display_format = None 
In [ ]:
iforest_anomoly_label = assign_model(iforest, score = True, verbose= True) 
iforest_anomoly_label
In [29]:
confusion_matrix = pd.crosstab(iforest_anomoly_label['Anomaly'], iforest_anomoly_label['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'], rownames=['Actual'], colnames=['Predicted'])
print (confusion_matrix)
Predicted      0    1
Actual               
0          19994  864
1           1043   55
In [ ]:
# tuned_iforest = tune_model(model = 'iforest', supervised_target = 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN')
In [215]:
df_3_sub.columns
Out[215]:
Index(['ME Action Email', 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN',
       'ME_Is_Unique_FROM_CLICK_OR_OPEN', 'ME Browser_FROM_CLICK_OR_OPEN',
       'ME Email Client_FROM_CLICK_OR_OPEN',
       'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN',
       'Sent_to_Open', 'Sent_to_Click', 'Open_to_Click',
       'reference_time_to_Open', 'reference_time_to_Click',
       'reference_time_to_Sent'],
      dtype='object')
In [65]:
df_sub_sub.isnull().sum()
Out[65]:
ME Action Email                              0
BOT_ACTIVITY_FROM_CLICK_OR_OPEN              0
ME_Is_Unique_FROM_CLICK_OR_OPEN              0
ME Browser_FROM_CLICK_OR_OPEN                0
ME Email Client_FROM_CLICK_OR_OPEN           0
ME OperatingSystem_FROM_CLICK_OR_OPEN        0
ME Device_FROM_CLICK_OR_OPEN                 0
Sent_to_Open                                 0
Sent_to_Click                            16704
Open_to_Click                            16704
reference_time_to_Open                       0
reference_time_to_Click                  16704
reference_time_to_Sent                       0
dtype: int64
In [71]:
# isolation forest for imbalanced classification
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import IsolationForest
# define outlier detection model
model = IsolationForest(contamination=0.9, behaviour='new')


## X and y defenition for train_test_split
df_sub_sub = df_3_sub

# df_sub_sub = df_sub_sub.loc[df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].astype(str) != 'Bot_open_model']
# df_sub_sub = df_sub_sub.loc[df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].astype(str) != 'Bot_click_model']
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('false', 1)
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_Honeypot', -1)
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_Honeypot_around', -1)
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_open_model',-1)
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_model',-1)

df_sub_sub_x = df_sub_sub[['ME_Is_Unique_FROM_CLICK_OR_OPEN', 'ME Browser_FROM_CLICK_OR_OPEN',
       'ME Email Client_FROM_CLICK_OR_OPEN',
       'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN',
       'Sent_to_Open' ]]#,, 'Open_to_Click''Sent_to_Click' 'reference_time_to_Open', 'reference_time_to_Click','reference_time_to_Sent'

## One -hot -encoding
one_hot_data = pd.get_dummies(df_sub_sub_x[['ME_Is_Unique_FROM_CLICK_OR_OPEN', 'ME Browser_FROM_CLICK_OR_OPEN',
       'ME Email Client_FROM_CLICK_OR_OPEN',
       'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN']], drop_first=True, dtype=float)#

# df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('false',1)
# df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_open_model',-1)
# df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_model',-1)
# df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_Honeypot',-1)
# df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_Honeypot_around',-1)

X, y = one_hot_data, df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=1)
# # fit on majority class
X_train = X_train[y_train==1]
model.fit(X_train)

# ...
# detect outliers in the test set
yhat = model.predict(X_test)


...
# # mark inliers 1, outliers -1
y_test[y_test == 1] = 1
y_test[y_test == -1] = -1


y_test=y_test.to_numpy()
y_test=y_test.astype(str).astype(int)

# calculate score
score = f1_score(y_test, yhat, pos_label=-1)
print('F1 Score: %.3f' % score)
F1 Score: 0.154
In [88]:
df_sub_sub.
  File "<ipython-input-88-437b144ecb87>", line 1
    df_sub_sub.
               ^
SyntaxError: invalid syntax
In [296]:
df_3_sub.head(1)
Out[296]:
ME Action Email BOT_ACTIVITY_FROM_CLICK_OR_OPEN ME_Is_Unique_FROM_CLICK_OR_OPEN ME Browser_FROM_CLICK_OR_OPEN ME Email Client_FROM_CLICK_OR_OPEN ME OperatingSystem_FROM_CLICK_OR_OPEN ME Device_FROM_CLICK_OR_OPEN Sent_to_Open Sent_to_Click Open_to_Click reference_time_to_Open reference_time_to_Click reference_time_to_Sent
0 81@xs4all.nl false True Safari iPhone iOS iPhone 242.0 NaN NaN 19908474.0 NaN 19908232.0

New data 6+6 Paycaret classification

In [305]:
#df_sub_sub = df_3_sub

df_sub_sub = df_sub_sub.loc[df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].astype(str) != 'Bot_open_model']
df_sub_sub = df_sub_sub.loc[df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].astype(str) != 'Bot_click_model']
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('false', -1)
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_Honeypot', 1)
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_Honeypot_around', 1)
# df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_open_model',-1)
# df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace('Bot_click_model',-1)

import pycaret
from pycaret.classification import *
exp_clf101 = setup(data = df_3_sub,
                    ignore_features=['ME Action Email'],                             
                    ignore_low_variance= True , combine_rare_levels= True, data_split_stratify = True,
                   remove_multicollinearity= True,  use_gpu= True, target = 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN' , session_id=123) #, profile= True, high_cardinality_features=['emailclient', 'operatingsystem'], transformation= True,feature_selection= True,
Description Value
0 session_id 123
1 Target BOT_ACTIVITY_FROM_CLICK_OR_OPEN
2 Target Type Multiclass
3 Label Encoded Bot_click_Honeypot: 0, Bot_click_Honeypot_around: 1, Bot_click_model: 2, Bot_open_model: 3, false: 4
4 Original Data (23534, 13)
5 Missing Values True
6 Numeric Features 6
7 Categorical Features 5
8 Ordinal Features False
9 High Cardinality Features False
10 High Cardinality Method None
11 Transformed Train Set (16473, 31)
12 Transformed Test Set (7061, 31)
13 Shuffle Train-Test True
14 Stratify Train-Test True
15 Fold Generator StratifiedKFold
16 Fold Number 10
17 CPU Jobs -1
18 Use GPU True
19 Log Experiment False
20 Experiment Name clf-default-name
21 USI 5ffe
22 Imputation Type simple
23 Iterative Imputation Iteration None
24 Numeric Imputer mean
25 Iterative Imputation Numeric Model None
26 Categorical Imputer constant
27 Iterative Imputation Categorical Model None
28 Unknown Categoricals Handling least_frequent
29 Normalize False
30 Normalize Method None
31 Transformation False
32 Transformation Method None
33 PCA False
34 PCA Method None
35 PCA Components None
36 Ignore Low Variance True
37 Combine Rare Levels True
38 Rare Level Threshold 0.100000
39 Numeric Binning False
40 Remove Outliers False
41 Outliers Threshold None
42 Remove Multicollinearity True
43 Multicollinearity Threshold 0.900000
44 Remove Perfect Collinearity True
45 Clustering False
46 Clustering Iteration None
47 Polynomial Features False
48 Polynomial Degree None
49 Trignometry Features False
50 Polynomial Threshold None
51 Group Features False
52 Feature Selection False
53 Feature Selection Method classic
54 Features Selection Threshold None
55 Feature Interaction False
56 Feature Ratio False
57 Interaction Threshold None
58 Fix Imbalance False
59 Fix Imbalance Method SMOTE
In [304]:
df_3_sub.shape
Out[304]:
(23534, 13)
In [306]:
best_model = compare_models()
Model Accuracy AUC Recall Prec. F1 Kappa MCC TT (Sec)
rf Random Forest Classifier 0.9636 0.9877 0.8460 0.9641 0.9636 0.8165 0.8170 1.0860
lightgbm Light Gradient Boosting Machine 0.9627 0.9919 0.8312 0.9629 0.9625 0.8104 0.8106 0.7070
et Extra Trees Classifier 0.9599 0.9620 0.8369 0.9608 0.9601 0.7993 0.7996 1.1760
dt Decision Tree Classifier 0.9596 0.9243 0.8445 0.9612 0.9601 0.8001 0.8006 0.0590
gbc Gradient Boosting Classifier 0.9574 0.9879 0.7957 0.9569 0.9563 0.7793 0.7801 8.9020
knn K Neighbors Classifier 0.9482 0.9771 0.7797 0.9496 0.9486 0.7432 0.7437 0.5570
lda Linear Discriminant Analysis 0.8939 0.8328 0.4222 0.8720 0.8823 0.3931 0.3981 0.1250
ridge Ridge Classifier 0.8938 0.0000 0.2000 0.7990 0.8437 -0.0002 -0.0010 0.0410
lr Logistic Regression 0.8919 0.7861 0.2019 0.8063 0.8442 0.0173 0.0468 10.6850
ada Ada Boost Classifier 0.8798 0.8696 0.5224 0.8938 0.8822 0.4541 0.4694 0.6110
qda Quadratic Discriminant Analysis 0.8275 0.7590 0.4733 0.8720 0.8424 0.3185 0.3357 0.0630
svm SVM - Linear Kernel 0.7398 0.0000 0.2109 0.8169 0.7080 0.0035 0.0140 0.4530
nb Naive Bayes 0.6789 0.7840 0.4906 0.8547 0.7422 0.1741 0.2189 0.0240
In [307]:
print(best_model)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=123, verbose=0,
                       warm_start=False)
In [313]:
rf = create_model('rf')
Accuracy AUC Recall Prec. F1 Kappa MCC
0 0.9715 0.9837 0.8557 0.9712 0.9709 0.8521 0.8527
1 0.9557 0.9819 0.7722 0.9546 0.9546 0.7709 0.7713
2 0.9606 0.9918 0.8009 0.9613 0.9604 0.7993 0.7994
3 0.9696 0.9924 0.8875 0.9706 0.9699 0.8480 0.8481
4 0.9581 0.9821 0.8106 0.9575 0.9576 0.7822 0.7825
5 0.9648 0.9891 0.8593 0.9660 0.9652 0.8268 0.8272
6 0.9715 0.9907 0.8983 0.9733 0.9721 0.8618 0.8628
7 0.9642 0.9899 0.8950 0.9667 0.9650 0.8274 0.8286
8 0.9605 0.9872 0.8522 0.9607 0.9604 0.8019 0.8019
9 0.9599 0.9885 0.8286 0.9596 0.9595 0.7950 0.7952
Mean 0.9636 0.9877 0.8460 0.9641 0.9636 0.8165 0.8170
SD 0.0054 0.0037 0.0403 0.0060 0.0057 0.0296 0.0298
In [316]:
lgb = create_model('lightgbm')
Accuracy AUC Recall Prec. F1 Kappa MCC
0 0.9654 0.9922 0.8104 0.9644 0.9646 0.8197 0.8203
1 0.9575 0.9909 0.7925 0.9587 0.9572 0.7822 0.7826
2 0.9648 0.9926 0.8253 0.9649 0.9645 0.8199 0.8201
3 0.9690 0.9940 0.8726 0.9688 0.9688 0.8416 0.8417
4 0.9581 0.9893 0.8346 0.9580 0.9579 0.7851 0.7852
5 0.9636 0.9920 0.8117 0.9648 0.9639 0.8185 0.8187
6 0.9648 0.9943 0.8310 0.9647 0.9646 0.8218 0.8218
7 0.9587 0.9909 0.8429 0.9597 0.9589 0.7950 0.7952
8 0.9617 0.9913 0.8352 0.9616 0.9615 0.8070 0.8070
9 0.9630 0.9912 0.8560 0.9636 0.9631 0.8135 0.8136
Mean 0.9627 0.9919 0.8312 0.9629 0.9625 0.8104 0.8106
SD 0.0035 0.0014 0.0220 0.0032 0.0034 0.0174 0.0174
In [317]:
print(lgb)
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=123, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)
In [318]:
dt = create_model('dt')
Accuracy AUC Recall Prec. F1 Kappa MCC
0 0.9624 0.9207 0.8257 0.9619 0.9621 0.8080 0.8081
1 0.9557 0.9199 0.8171 0.9574 0.9563 0.7821 0.7825
2 0.9545 0.9149 0.8000 0.9563 0.9547 0.7725 0.7728
3 0.9666 0.9391 0.9001 0.9685 0.9673 0.8366 0.8374
4 0.9532 0.9037 0.8060 0.9547 0.9538 0.7658 0.7660
5 0.9581 0.9259 0.8385 0.9606 0.9591 0.7961 0.7968
6 0.9702 0.9501 0.8918 0.9721 0.9708 0.8548 0.8555
7 0.9605 0.9390 0.8753 0.9619 0.9611 0.8070 0.8075
8 0.9593 0.9169 0.8511 0.9600 0.9595 0.7984 0.7986
9 0.9551 0.9133 0.8391 0.9583 0.9564 0.7801 0.7807
Mean 0.9596 0.9243 0.8445 0.9612 0.9601 0.8001 0.8006
SD 0.0052 0.0135 0.0331 0.0051 0.0052 0.0267 0.0268
In [319]:
print(dt)
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=123, splitter='best')
In [308]:
print(knn)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform')
In [314]:
plot_model(rf, plot = 'auc')
In [171]:
df_sub_sub = df_3_sub

df_sub_sub = df_sub_sub.loc[df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].astype(str) != 'Bot_open_model']
df_sub_sub = df_sub_sub.loc[df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].astype(str) != 'Bot_click_model']

df_sub_sub.shape
Out[171]:
(21956, 12)
In [172]:
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
Out[172]:
false                        21037
Bot_click_Honeypot             470
Bot_click_Honeypot_around      449
Name: BOT_ACTIVITY_FROM_CLICK_OR_OPEN, dtype: int64
In [173]:
df_3_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
Out[173]:
false                        21037
Bot_click_model                929
Bot_open_model                 649
Bot_click_Honeypot             470
Bot_click_Honeypot_around      449
Name: BOT_ACTIVITY_FROM_CLICK_OR_OPEN, dtype: int64

SVM

In [174]:
svm = create_model('svm') 
Initiated . . . . . . . . . . . . . . . . . . 22:22:14
Status . . . . . . . . . . . . . . . . . . Compiling Final Models
Estimator . . . . . . . . . . . . . . . . . . Light Gradient Boosting Machine
Accuracy AUC Recall Prec. F1 Kappa MCC
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-174-7f6da73f8546> in <module>
----> 1 svm = create_model('svm')

~/anaconda3/envs/python3/lib/python3.6/site-packages/pycaret/classification.py in create_model(estimator, fold, round, cross_validation, fit_kwargs, groups, verbose, **kwargs)
    896         groups=groups,
    897         verbose=verbose,
--> 898         **kwargs,
    899     )
    900 

~/anaconda3/envs/python3/lib/python3.6/site-packages/pycaret/internal/tabular.py in create_model_supervised(estimator, fold, round, cross_validation, predict, fit_kwargs, groups, refit, verbose, system, X_train_data, y_train_data, metrics, add_to_model_list, display, **kwargs)
   3188             logger.info("Finalizing model")
   3189             with io.capture_output():
-> 3190                 pipeline_with_model.fit(data_X, data_y, **fit_kwargs)
   3191             model_fit_end = time.time()
   3192 

~/anaconda3/envs/python3/lib/python3.6/site-packages/pycaret/internal/pipeline.py in fit(self, X, y, **fit_kwargs)
    116 
    117     def fit(self, X, y=None, **fit_kwargs):
--> 118         result = super().fit(X, y=y, **fit_kwargs)
    119 
    120         self._carry_over_final_estimator_fit_vars()

~/anaconda3/envs/python3/lib/python3.6/site-packages/imblearn/pipeline.py in fit(self, X, y, **fit_params)
    279                                  self._log_message(len(self.steps) - 1)):
    280             if self._final_estimator != "passthrough":
--> 281                 self._final_estimator.fit(Xt, yt, **fit_params)
    282         return self
    283 

~/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/linear_model/_stochastic_gradient.py in fit(self, X, y, coef_init, intercept_init, sample_weight)
    726                          loss=self.loss, learning_rate=self.learning_rate,
    727                          coef_init=coef_init, intercept_init=intercept_init,
--> 728                          sample_weight=sample_weight)
    729 
    730 

~/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/linear_model/_stochastic_gradient.py in _fit(self, X, y, alpha, C, loss, learning_rate, coef_init, intercept_init, sample_weight)
    539         X, y = self._validate_data(X, y, accept_sparse='csr',
    540                                    dtype=np.float64, order="C",
--> 541                                    accept_large_sparse=False)
    542 
    543         # labels can be encoded as float, int, or string literals

~/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params)
    430                 y = check_array(y, **check_y_params)
    431             else:
--> 432                 X, y = check_X_y(X, y, **check_params)
    433             out = X, y
    434 

~/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
     70                           FutureWarning)
     71         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72         return f(**kwargs)
     73     return inner_f
     74 

~/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)
    800                     ensure_min_samples=ensure_min_samples,
    801                     ensure_min_features=ensure_min_features,
--> 802                     estimator=estimator)
    803     if multi_output:
    804         y = check_array(y, accept_sparse='csr', force_all_finite=True,

~/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
     70                           FutureWarning)
     71         kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72         return f(**kwargs)
     73     return inner_f
     74 

~/anaconda3/envs/python3/lib/python3.6/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
    596                     array = array.astype(dtype, casting="unsafe", copy=False)
    597                 else:
--> 598                     array = np.asarray(array, order=order, dtype=dtype)
    599             except ComplexWarning:
    600                 raise ValueError("Complex data not supported\n"

~/anaconda3/envs/python3/lib/python3.6/site-packages/numpy/core/_asarray.py in asarray(a, dtype, order)
     81 
     82     """
---> 83     return array(a, dtype, copy=False, order=order)
     84 
     85 

~/anaconda3/envs/python3/lib/python3.6/site-packages/pandas/core/generic.py in __array__(self, dtype)
   1779 
   1780     def __array__(self, dtype=None) -> np.ndarray:
-> 1781         return np.asarray(self._values, dtype=dtype)
   1782 
   1783     def __array_wrap__(self, result, context=None):

~/anaconda3/envs/python3/lib/python3.6/site-packages/numpy/core/_asarray.py in asarray(a, dtype, order)
     81 
     82     """
---> 83     return array(a, dtype, copy=False, order=order)
     84 
     85 

ValueError: could not convert string to float: 'NL|2555382|117566|2021-10-14'
In [175]:
svm_anomoly_label = assign_model(svm, score = True, verbose= True) 
svm_anomoly_label
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-175-9e5f88fb5d8e> in <module>
----> 1 svm_anomoly_label = assign_model(svm, score = True, verbose= True)
      2 svm_anomoly_label

NameError: name 'svm' is not defined
In [176]:
confusion_matrix = pd.crosstab(svm_anomoly_label['Anomaly'], svm_anomoly_label['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'], rownames=['Actual'], colnames=['Predicted'])
print (confusion_matrix)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-176-aa5fa532d1dd> in <module>
----> 1 confusion_matrix = pd.crosstab(svm_anomoly_label['Anomaly'], svm_anomoly_label['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'], rownames=['Actual'], colnames=['Predicted'])
      2 print (confusion_matrix)

NameError: name 'svm_anomoly_label' is not defined
In [ ]:
 
In [ ]: